VyLala committed on
Commit f94920a · verified · 1 Parent(s): 9ebbf94

Upload 55 files

Files changed (9)
  1. NER/html/extractHTML.py +363 -363
  2. app.py +0 -0
  3. better_offer.html +201 -201
  4. data_preprocess.py +876 -876
  5. model.py +0 -0
  6. mtdna_backend.py +1004 -1144
  7. mtdna_classifier.py +768 -768
  8. pipeline.py +0 -0
  9. smart_fallback.py +401 -401
NER/html/extractHTML.py CHANGED
@@ -1,364 +1,364 @@
 
1
+ # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
2
+ from bs4 import BeautifulSoup
3
+ import os
+ import requests
4
+ from DefaultPackages import openFile, saveFile
5
+ from NER import cleanText
6
+ import pandas as pd
7
+ from lxml.etree import ParserError, XMLSyntaxError
8
+ import aiohttp
9
+ import asyncio
10
+ class HTML():
11
+ def __init__(self, htmlFile, htmlLink, htmlContent: str=None):
12
+ self.htmlLink = htmlLink
13
+ self.htmlFile = htmlFile
14
+ self.htmlContent = htmlContent # NEW: store raw HTML if provided
15
+ def fetch_crossref_metadata(self, doi):
16
+ """Fetch metadata from CrossRef API for a given DOI."""
17
+ try:
18
+ url = f"https://api.crossref.org/works/{doi}"
19
+ r = requests.get(url, timeout=10)
20
+ if r.status_code == 200:
21
+ return r.json().get("message", {})
22
+ else:
23
+ print(f"⚠️ CrossRef fetch failed ({r.status_code}) for DOI: {doi}")
24
+ return {}
25
+ except Exception as e:
26
+ print(f"❌ CrossRef exception: {e}")
27
+ return {}
28
+ # def openHTMLFile(self):
29
+ # headers = {
30
+ # "User-Agent": (
31
+ # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
32
+ # "AppleWebKit/537.36 (KHTML, like Gecko) "
33
+ # "Chrome/114.0.0.0 Safari/537.36"
34
+ # ),
35
+ # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
36
+ # "Referer": self.htmlLink,
37
+ # "Connection": "keep-alive"
38
+ # }
39
+
40
+ # session = requests.Session()
41
+ # session.headers.update(headers)
42
+
43
+ # if self.htmlLink != "None":
44
+ # try:
45
+ # r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
46
+ # if r.status_code != 200:
47
+ # print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
48
+ # return BeautifulSoup("", 'html.parser')
49
+ # soup = BeautifulSoup(r.content, 'html.parser')
50
+ # except Exception as e:
51
+ # print(f"❌ Exception fetching HTML: {e}")
52
+ # return BeautifulSoup("", 'html.parser')
53
+ # else:
54
+ # with open(self.htmlFile) as fp:
55
+ # soup = BeautifulSoup(fp, 'html.parser')
56
+ # return soup
57
+
58
+ def openHTMLFile(self):
59
+ """Return a BeautifulSoup object from cached htmlContent, file, or requests."""
60
+ # If raw HTML already provided (from async aiohttp), use it directly
61
+ if self.htmlContent is not None:
62
+ return BeautifulSoup(self.htmlContent, "html.parser")
63
+
64
+ not_need_domain = ['https://broadinstitute.github.io/picard/',
65
+ 'https://software.broadinstitute.org/gatk/best-practices/',
66
+ 'https://www.ncbi.nlm.nih.gov/genbank/',
67
+ 'https://www.mitomap.org/']
68
+ if self.htmlLink in not_need_domain:
69
+ return BeautifulSoup("", 'html.parser')
70
+ headers = {
71
+ "User-Agent": (
72
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
73
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
74
+ "Chrome/114.0.0.0 Safari/537.36"
75
+ ),
76
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
77
+ "Accept-Language": "en-US,en;q=0.9",
78
+ "Referer": "https://www.google.com/",
79
+ #"Referer": self.htmlLink,
80
+ "Connection": "keep-alive"
81
+ }
82
+
83
+ session = requests.Session()
84
+ session.headers.update(headers)
85
+ try:
86
+ if self.htmlLink and self.htmlLink != "None":
87
+ r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
88
+ if r.status_code != 200 or not r.text.strip():
89
+ print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
90
+ return BeautifulSoup("", 'html.parser')
91
+ soup = BeautifulSoup(r.content, 'html.parser')
92
+ elif self.htmlFile:
93
+ with open(self.htmlFile, encoding='utf-8') as fp:
94
+ soup = BeautifulSoup(fp, 'html.parser')
95
+ except (ParserError, XMLSyntaxError, OSError) as e:
96
+ print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
97
+ return BeautifulSoup("", 'html.parser')
98
+ except Exception as e:
99
+ print(f"❌ General exception for {self.htmlLink}: {e}")
100
+ return BeautifulSoup("", 'html.parser')
101
+
102
+ return soup
103
+
104
+ async def async_fetch_html(self):
105
+ """Async fetch HTML content with aiohttp."""
106
+ not_need_domain = [
107
+ "https://broadinstitute.github.io/picard/",
108
+ "https://software.broadinstitute.org/gatk/best-practices/",
109
+ "https://www.ncbi.nlm.nih.gov/genbank/",
110
+ "https://www.mitomap.org/",
111
+ ]
112
+ if self.htmlLink in not_need_domain:
113
+ return "" # Skip domains we don't need
114
+
115
+ headers = {
116
+ "User-Agent": (
117
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
118
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
119
+ "Chrome/114.0.0.0 Safari/537.36"
120
+ ),
121
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
122
+ "Accept-Language": "en-US,en;q=0.9",
123
+ "Referer": "https://www.google.com/",
124
+ "Connection": "keep-alive",
125
+ }
126
+
127
+ try:
128
+ async with aiohttp.ClientSession(headers=headers) as session:
129
+ async with session.get(self.htmlLink, timeout=15) as resp:
130
+ if resp.status != 200:
131
+ print(f"❌ HTML GET failed ({resp.status}) — {self.htmlLink}")
132
+ return ""
133
+ return await resp.text()
134
+ except Exception as e:
135
+ print(f"❌ Async fetch failed for {self.htmlLink}: {e}")
136
+ return ""
137
+
138
+ @classmethod
139
+ async def bulk_fetch(cls, links: list[str]):
140
+ """Fetch multiple links concurrently, return list of HTML() objects with htmlContent filled."""
141
+ tasks = [cls("", link).async_fetch_html() for link in links]
142
+ results = await asyncio.gather(*tasks, return_exceptions=True)
143
+
144
+ out = []
145
+ for link, content in zip(links, results):
146
+ if isinstance(content, Exception):
147
+ print(f"⚠️ Exception while fetching {link}: {content}")
148
+ out.append(cls("", link, htmlContent=""))
149
+ else:
150
+ out.append(cls("", link, htmlContent=content))
151
+ return out
152
+
153
+
154
+ def getText(self):
155
+ try:
156
+ soup = self.openHTMLFile()
157
+ s = soup.find_all("html")
158
+ text = ""
159
+ if s:
160
+ for t in range(len(s)):
161
+ text = s[t].get_text()
162
+ cl = cleanText.cleanGenText()
163
+ text = cl.removeExtraSpaceBetweenWords(text)
164
+ return text
165
+ except Exception as e:
166
+ print(f"failed to get text from html: {e}")
167
+ return ""
168
+
169
+ async def async_getListSection(self, scienceDirect=None):
170
+ try:
171
+ json = {}
172
+ textJson, textHTML = "", ""
173
+
174
+ # Use preloaded HTML (fast path)
175
+ soup = self.openHTMLFile()
176
+ h2_tags = soup.find_all('h2')
177
+ for idx, h2 in enumerate(h2_tags):
178
+ section_title = h2.get_text(strip=True)
179
+ json.setdefault(section_title, [])
180
+ next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
181
+ for p in h2.find_all_next("p"):
182
+ if next_h2 and p == next_h2:
183
+ break
184
+ json[section_title].append(p.get_text(strip=True))
185
+
186
+ # If no sections or explicitly ScienceDirect
187
+ if scienceDirect is not None or len(json) == 0:
188
+ print("async fetching ScienceDirect metadata...")
189
+ api_key = os.environ.get("SCIENCE_DIRECT_API", "")  # same env var as getListSection; avoid hardcoding the Elsevier key
190
+ doi = self.htmlLink.split("https://doi.org/")[-1]
191
+ base_url = f"https://api.elsevier.com/content/article/doi/{doi}"
192
+ headers = {"Accept": "application/json", "X-ELS-APIKey": api_key}
193
+
194
+ async with aiohttp.ClientSession() as session:
195
+ async with session.get(base_url, headers=headers, timeout=15) as resp:
196
+ if resp.status == 200:
197
+ data = await resp.json()
198
+ if isinstance(data, dict):
199
+ json["fullText"] = data
200
+
201
+ # Merge text
202
+ textJson = self.mergeTextInJson(json)
203
+ textHTML = self.getText()
204
+ return textHTML if len(textHTML) > len(textJson) else textJson
205
+
206
+ except Exception as e:
207
+ print("❌ async_getListSection failed:", e)
208
+ return ""
209
+
210
+ def getListSection(self, scienceDirect=None):
211
+ try:
212
+ json = {}
213
+ text = ""
214
+ textJson, textHTML = "",""
215
+ if scienceDirect == None:
216
+ # soup = self.openHTMLFile()
217
+ # # get list of section
218
+ # json = {}
219
+ # for h2Pos in range(len(soup.find_all('h2'))):
220
+ # if soup.find_all('h2')[h2Pos].text not in json:
221
+ # json[soup.find_all('h2')[h2Pos].text] = []
222
+ # if h2Pos + 1 < len(soup.find_all('h2')):
223
+ # content = soup.find_all('h2')[h2Pos].find_next("p")
224
+ # nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
225
+ # while content.text != nexth2Content.text:
226
+ # json[soup.find_all('h2')[h2Pos].text].append(content.text)
227
+ # content = content.find_next("p")
228
+ # else:
229
+ # content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
230
+ # json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
231
+
232
+ soup = self.openHTMLFile()
233
+ h2_tags = soup.find_all('h2')
234
+ json = {}
235
+
236
+ for idx, h2 in enumerate(h2_tags):
237
+ section_title = h2.get_text(strip=True)
238
+ json.setdefault(section_title, [])
239
+
240
+ # Get paragraphs until next H2
241
+ next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
242
+ for p in h2.find_all_next("p"):
243
+ if next_h2 and p == next_h2:
244
+ break
245
+ json[section_title].append(p.get_text(strip=True))
246
+ # format
247
+ '''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
248
+ 'Results':[], 'Discussion':[], 'References':[],
249
+ 'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
250
+ 'Additional information':[], 'Electronic supplementary material':[],
251
+ 'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
252
+ if scienceDirect!= None or len(json)==0:
253
+ # Replace with your actual Elsevier API key
254
+ api_key = os.environ["SCIENCE_DIRECT_API"]
255
+ # ScienceDirect article DOI or PI (Example DOI)
256
+ doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
257
+ # Base URL for the Elsevier API
258
+ base_url = "https://api.elsevier.com/content/article/doi/"
259
+ # Set headers with API key
260
+ headers = {
261
+ "Accept": "application/json",
262
+ "X-ELS-APIKey": api_key
263
+ }
264
+ # Make the API request
265
+ response = requests.get(base_url + doi, headers=headers)
266
+ # Check if the request was successful
267
+ if response.status_code == 200:
268
+ data = response.json()
269
+ supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
270
+ # if "originalText" in list(supp_data.keys()):
271
+ # if type(supp_data["originalText"])==str:
272
+ # json["originalText"] = [supp_data["originalText"]]
273
+ # if type(supp_data["originalText"])==dict:
274
+ # json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
275
+ # else:
276
+ # if type(supp_data)==dict:
277
+ # for key in supp_data:
278
+ # json[key] = [supp_data[key]]
279
+ if type(data)==dict:
280
+ json["fullText"] = data
281
+ textJson = self.mergeTextInJson(json)
282
+ textHTML = self.getText()
283
+ if len(textHTML) > len(textJson):
284
+ text = textHTML
285
+ else: text = textJson
286
+ return text #json
287
+ except Exception as e:
288
+ print(f"getListSection failed: {e}")
289
+ return ""
290
+ def getReference(self):
291
+ # get reference to collect more next data
292
+ ref = []
293
+ json = self.getListSection()
294
+ for key in json["References"]:
295
+ ct = cleanText.cleanGenText(key)
296
+ cleaned, filteredWord = ct.cleanText()  # avoid shadowing the imported cleanText module
297
+ if cleaned not in ref:
298
+ ref.append(cleaned)
299
+ return ref
300
+ def getSupMaterial(self):
301
+ # check if there is material or not
302
+ json = {}
303
+ soup = self.openHTMLFile()
304
+ for h2Pos in range(len(soup.find_all('h2'))):
305
+ if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
306
+ #print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
307
+ link, output = [],[]
308
+ if soup.find_all('h2')[h2Pos].text not in json:
309
+ json[soup.find_all('h2')[h2Pos].text] = []
310
+ for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
311
+ link.append(l["href"])
312
+ if h2Pos + 1 < len(soup.find_all('h2')):
313
+ nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
314
+ if nexth2Link in link:
315
+ link = link[:link.index(nexth2Link)]
316
+ # only take links having "https" in that
317
+ for i in link:
318
+ if "https" in i: output.append(i)
319
+ json[soup.find_all('h2')[h2Pos].text].extend(output)
320
+ return json
321
+ def extractTable(self):
322
+ soup = self.openHTMLFile()
323
+ df = []
324
+ if len(soup)>0:
325
+ try:
326
+ df = pd.read_html(str(soup))
327
+ except ValueError:
328
+ df = []
329
+ print("No tables found in HTML file")
330
+ return df
331
+ def mergeTextInJson(self,jsonHTML):
332
+ try:
333
+ #cl = cleanText.cleanGenText()
334
+ htmlText = ""
335
+ if jsonHTML:
336
+ # try:
337
+ # for sec, entries in jsonHTML.items():
338
+ # for i, entry in enumerate(entries):
339
+ # # Only process if it's actually text
340
+ # if isinstance(entry, str):
341
+ # if entry.strip():
342
+ # entry, filteredWord = cl.textPreprocessing(entry, keepPeriod=True)
343
+ # else:
344
+ # # Skip or convert dicts/lists to string if needed
345
+ # entry = str(entry)
346
+
347
+ # jsonHTML[sec][i] = entry
348
+
349
+ # # Add spacing between sentences
350
+ # if i - 1 >= 0 and jsonHTML[sec][i - 1] and jsonHTML[sec][i - 1][-1] != ".":
351
+ # htmlText += ". "
352
+ # htmlText += entry
353
+
354
+ # # Add final period if needed
355
+ # if entries and isinstance(entries[-1], str) and entries[-1] and entries[-1][-1] != ".":
356
+ # htmlText += "."
357
+ # htmlText += "\n\n"
358
+ # except:
359
+ htmlText += str(jsonHTML)
360
+ return htmlText
361
+ except Exception as e:
362
+ print(f"failed to merge text in json: {e}")
363
+ return ""
364
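
For orientation, a minimal usage sketch of the class above, assuming the NER.html.extractHTML import path used elsewhere in this commit; the DOI link is illustrative only:

    import asyncio
    from NER.html import extractHTML  # import path used in data_preprocess.py below

    async def demo():
        links = ["https://doi.org/10.1016/j.ajhg.2011.01.009"]  # illustrative DOI from the comments above
        # bulk_fetch pre-fills htmlContent, so later parsing skips the extra network round-trip
        pages = await extractHTML.HTML.bulk_fetch(links)
        for page in pages:
            text = await page.async_getListSection()
            print(page.htmlLink, len(text))

    asyncio.run(demo())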
 
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
better_offer.html CHANGED
@@ -1,201 +1,201 @@
 
1
+ <div id="classifier-page">
2
+ <style>
3
+ /* Force light mode inside this block only */
4
+ #classifier-page {
5
+ background: #ffffff !important;
6
+ color: #0f172a !important;
7
+ font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif !important;
8
+
9
+ max-width: 900px !important;
10
+ margin: 24px auto !important;
11
+ padding: 28px 20px !important;
12
+ border-radius: 12px !important;
13
+ box-shadow: 0 2px 8px rgba(0,0,0,0.05) !important;
14
+ }
15
+
16
+ /* All text dark */
17
+ #classifier-page * {
18
+ color: #0f172a !important;
19
+ }
20
+
21
+ /* Pills */
22
+ #classifier-page .pill {
23
+ display: inline-block !important;
24
+ padding: 6px 10px !important;
25
+ border-radius: 999px !important;
26
+ font-size: 12px !important;
27
+ font-weight: 500 !important;
28
+ margin: 2px !important;
29
+ }
30
+ #classifier-page .pill-blue { background:#eef2ff !important; color:#3730a3 !important; }
31
+ #classifier-page .pill-cyan { background:#ecfeff !important; color:#155e75 !important; }
32
+ #classifier-page .pill-green { background:#f0fdf4 !important; color:#166534 !important; }
33
+ #classifier-page .pill-orange { background:#fff7ed !important; color:#9a3412 !important; }
34
+
35
+ /* Explicitly restore button background overrides */
36
+ #classifier-page a {
37
+ text-decoration: none !important;
38
+ }
39
+ #classifier-page a[href*="mtDNALocation"] {
40
+ background: #111827 !important; /* black */
41
+ color: #ffffff !important; /* white text */
42
+ text-decoration: none !important;
43
+ padding: 12px 16px !important;
44
+ border-radius: 10px !important;
45
+ font-weight: 600 !important;
46
+ display: inline-block !important;
47
+ }
48
+
49
+
50
+ </style>
51
+ <!-- Header -->
52
+ <h1 style="margin:0 0 8px; font-size:32px;">mtDNA Location Classifier</h1>
53
+ <p style="margin:0 0 16px; font-size:18px; color:#334155;">
54
+ <strong>AI + Human Intelligence, working together.</strong><br>
55
+ The tool suggests structured labels fast — you decide which ones to trust and refine.
56
+ </p>
57
+
58
+ <!-- Badges -->
59
+ <div style="display:flex; gap:8px; flex-wrap:wrap; margin:12px 0 24px;">
60
+ <span class="pill pill-blue">84% country accuracy (n=4,934)</span>
61
+ <span class="pill pill-cyan">92% modern/ancient accuracy (n=4,934)</span>
62
+ <span class="pill pill-green">Source-backed explanations</span>
63
+ <span class="pill pill-orange">Report → free credit</span>
64
+ </div>
65
+
66
+ <hr style="border:none; border-top:1px solid #e2e8f0; margin:24px 0;">
67
+
68
+ <!-- Purpose -->
69
+ <h2 style="margin:0 0 8px; font-size:22px;">Purpose</h2>
70
+ <p style="margin:0 0 12px;">
71
+ Make biological data <strong>reusable</strong> by labeling it better.
72
+ <br><br>
73
+ Many GenBank / NCBI samples have <strong>incomplete/missing metadata</strong> (country, sample type, optional ethnicity/specific location).
74
+ This tool helps researchers generate <strong>clean, structured labels</strong> — ready for papers, datasets, or analysis.
75
+ <br><br>
76
+ <em>This is not a black-box AI. It’s a partnership between AI speed and human expertise.</em>
77
+ </p>
78
+
79
+ <!-- What you get -->
80
+ <h2 style="margin:24px 0 8px; font-size:22px;">What you get</h2>
81
+ <ul style="margin:0 0 12px 18px;">
82
+ <li>AI-powered inference from GenBank accession alone.</li>
83
+ <li>Country + Sample Type by default; optional labels (e.g. ethnicity &amp; specific location) on request.</li>
84
+ <li>Transparent outputs: explanations, citations.</li>
85
+ <li>Excel export; batch upload; multi-ID input.</li>
86
+ <li><strong>Human-in-the-loop control:</strong> 1-click feedback ensures you decide what counts.</li>
87
+ </ul>
88
+
89
+ <div style="background:#f8fafc; border:1px solid #e2e8f0; border-radius:12px; padding:14px; margin:16px 0;">
90
+ <strong>Positioning:</strong> This tool is an <em>accelerator</em>, not a replacement.
91
+ AI surfaces leads quickly → Human Intelligence validates tricky cases.
92
+ </div>
93
+
94
+ <hr style="border:none; border-top:1px solid #e2e8f0; margin:24px 0;">
95
+
96
+ <!-- Free tier -->
97
+ <h2 style="margin:0 0 8px; font-size:22px;">Free tier</h2>
98
+ <ul style="margin:0 0 12px 18px;">
99
+ <li><strong>30</strong> free samples (no email).</li>
100
+ <li>Add email → <strong>+20</strong> bonus samples (<strong>50 total</strong>) and downloads.</li>
101
+ <li>Not satisfied? Click “Report” → that row doesn’t count, and you get a credit back.</li>
102
+ </ul>
103
+
104
+ <!-- Pricing -->
105
+ <h2 style="margin:24px 0 8px; font-size:22px;">Simple pricing</h2>
106
+ <table style="width:100%; border-collapse:collapse; border:1px solid #e2e8f0; border-radius:10px; overflow:hidden;">
107
+ <thead>
108
+ <tr style="background:#f1f5f9;">
109
+ <th style="text-align:left; padding:10px; font-weight:600;">Plan</th>
110
+ <th style="text-align:left; padding:10px; font-weight:600;">What’s included</th>
111
+ <th style="text-align:left; padding:10px; font-weight:600;">Price</th>
112
+ </tr>
113
+ </thead>
114
+ <tbody>
115
+ <tr>
116
+ <td style="padding:10px; border-top:1px solid #e2e8f0;"><strong>Pay-as-you-go (especially for <a href="#edge_cases" style="color:#1d4ed8; text-decoration:underline;">edge cases</a>)</strong></td>
117
+ <td style="padding:10px; border-top:1px solid #e2e8f0;">Country + Sample Type, explanations, citations, export, report→credit</td>
118
+ <td style="padding:10px; border-top:1px solid #e2e8f0;"><strong>$0.10 / sample</strong></td>
119
+ </tr>
120
+ <tr>
121
+ <td style="padding:10px; border-top:1px solid #e2e8f0;"><strong>Custom labels (optional)</strong></td>
122
+ <td style="padding:10px; border-top:1px solid #e2e8f0;">Ethnicity, specific location granularity, phenotype, or bespoke fields</td>
123
+ <td style="padding:10px; border-top:1px solid #e2e8f0;">Quote on request</td>
124
+ </tr>
125
+ <tr style="background:#fcfcff;">
126
+ <td style="padding:10px; border-top:1px solid #e2e8f0;"><strong> <a href="#research-supporter" style="color:#1d4ed8; text-decoration:underline;">Research Partner (Supporter) </a></strong></td>
127
+ <td style="padding:10px; border-top:1px solid #e2e8f0;">~3,000 samples worth of credits + early access, custom label runs, direct feedback channel, recognition</td>
128
+ <td style="padding:10px; border-top:1px solid #e2e8f0;"><strong>$300 / 3 months</strong></td>
129
+ </tr>
130
+ </tbody>
131
+ </table>
132
+
133
+ <p style="margin:10px 0 0; font-size:14px; color:#475569;">
134
+ <em>Note:</em> For very small sets that you can easily verify manually, we’ll advise you to skip paid runs.
135
+ We optimize for your outcomes, not usage.
136
+ </p>
137
+
138
+ <!-- Edge case highlight -->
139
+ <h2 id="edge_cases" style="margin:24px 0 8px; font-size:22px;">Edge Cases (our specialty)</h2>
140
+ <p>
141
+ Some samples are especially hard to label because they don’t have a DOI, PubMed ID, or linked article.
142
+ Normally these are ignored or left as “unknown.” We call them <strong>edge cases</strong>.
143
+ </p>
144
+ <ul>
145
+ <li>Priced the same as normal runs ($0.10/sample) — no penalty for difficulty</li>
146
+ <li>Custom labels (e.g. ethnicity, city/province) can also be applied to edge cases on request</li>
147
+ </ul>
148
+ <div style="background:#fff7ed; border:1px solid #fed7aa; border-radius:12px; padding:14px; margin:20px 0;">
149
+ <strong>Why it matters:</strong> One early researcher tested 4,932 samples and found our predictions
150
+ for some <em>edge cases</em> were more accurate than his manual annotations — even when metadata was missing.
151
+ </div>
152
+
153
+ <hr>
154
+
155
+ <h2 id="research-supporter" style="margin:24px 0 8px; font-size:22px;">Research Partner Plan (for early supporters)</h2>
156
+ <p>
157
+ Designed for researchers running larger studies who want to support ongoing development while staying on budget.
158
+ Instead of paying strictly per sample, you can join as a <strong>Research Partner</strong>:
159
+ </p>
160
+ <ul>
161
+ <li><strong>$300 flat contribution</strong> (covers ~3,000 samples at $0.10 each, with flexibility on usage)</li>
162
+ <li>Includes early access to new features and custom labels</li>
163
+ <li>Direct feedback channel — help shape how the tool evolves</li>
164
+ <li>Recognition as an early research supporter</li>
165
+ </ul>
166
+ <p>
167
+ <em>This tier was inspired by our very first paying researcher, who contributed $300 to support
168
+ continued development after testing thousands of samples. It’s ideal if you see the potential
169
+ and want to support the mission, even if you’re still validating outputs in your workflow.</em>
170
+ </p>
171
+
172
+ <!-- Who it's for -->
173
+ <h2 style="margin:24px 0 8px; font-size:22px;">Best for</h2>
174
+ <ul style="margin:0 0 12px 18px;">
175
+ <li>Labs cleaning large mtDNA cohorts where manual labeling is slow or inconsistent.</li>
176
+ <li>Researchers who want fast leads + citations, then validate edge cases themselves.</li>
177
+ <li>Teams that value transparency and iterative improvement.</li>
178
+ </ul>
179
+
180
+ <!-- CTA -->
181
+ <div style="display:flex; gap:12px; flex-wrap:wrap; margin:20px 0;">
182
+ <a href="https://huggingface.co/spaces/VyLala/mtDNALocation" target="_blank"
183
+ style="background:#111827; color:#fff; text-decoration:none; padding:12px 16px; border-radius:10px; font-weight:600;">
184
+ Try the Classifier
185
+ </a>
186
+ <a href="mailto:khanhphungvy@gmail.com" target="_blank"
187
+ style="background:#e2e8f0; color:#0f172a; text-decoration:none; padding:12px 16px; border-radius:10px; font-weight:600;">
188
+ Bulk / Research Partner request
189
+ </a>
190
+ </div>
191
+
192
+ <hr style="border:none; border-top:1px solid #e2e8f0; margin:24px 0;">
193
+
194
+ <!-- Mission -->
195
+ <h2 style="margin:0 0 8px; font-size:22px;">Mission</h2>
196
+ <p style="margin:0;">
197
+ Rebuild trust in genomic metadata—one mtDNA sample at a time—through transparency, citations, and a tight feedback loop with researchers.
198
+ </p>
199
+ </div>
200
+
201
+
data_preprocess.py CHANGED
@@ -1,877 +1,877 @@
1
- import re
2
- import os
3
- #import streamlit as st
4
- import subprocess
5
- import re
6
- from Bio import Entrez
7
- from docx import Document
8
- import fitz
9
- import spacy
10
- from spacy.cli import download
11
- from NER.PDF import pdf
12
- from NER.WordDoc import wordDoc
13
- from NER.html import extractHTML
14
- from NER.word2Vec import word2vec
15
- #from transformers import pipeline
16
- import urllib.parse, requests
17
- from pathlib import Path
18
- import pandas as pd
19
- import model
20
- import pipeline
21
- import tempfile
22
- import nltk
23
- nltk.download('punkt_tab')
24
- def download_excel_file(url, save_path="temp.xlsx"):
25
- if "view.officeapps.live.com" in url:
26
- parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
27
- real_url = urllib.parse.unquote(parsed_url["src"][0])
28
- response = requests.get(real_url)
29
- with open(save_path, "wb") as f:
30
- f.write(response.content)
31
- return save_path
32
- elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
33
- response = requests.get(url)
34
- response.raise_for_status() # Raises error if download fails
35
- with open(save_path, "wb") as f:
36
- f.write(response.content)
37
- print(len(response.content))
38
- return save_path
39
- else:
40
- print("URL must point directly to an .xls or .xlsx file, or the file has already been downloaded.")
41
- return url
42
-
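
A quick sketch of how download_excel_file is meant to be called; the URL and save path are placeholders, not values from this repo:

    local_xlsx = download_excel_file(
        "https://example.org/supp/table_s1.xlsx",  # placeholder: any direct .xls/.xlsx URL
        save_path="temp.xlsx",
    )
    sheets = pd.read_excel(local_xlsx, sheet_name=None)  # one DataFrame per sheet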
43
- from pathlib import Path
44
- import pandas as pd
45
-
46
- def process_file(link, saveFolder):
47
- """Returns (file_type, full_path, name) for a given link."""
48
- name = Path(link).name
49
- ext = Path(name).suffix.lower()
50
- file_path = Path(saveFolder) / name
51
-
52
- # If it's already in saveFolder, update link to local path
53
- if file_path.is_file():
54
- link = str(file_path)
55
-
56
- return ext, link, file_path
57
-
58
- import asyncio
59
- import aiohttp
60
- _html_cache = {}
61
-
62
- async def async_fetch_html(link: str, timeout: int = 15) -> str:
63
- """Fetch HTML asynchronously with caching."""
64
- if link in _html_cache:
65
- return _html_cache[link]
66
-
67
- try:
68
- async with aiohttp.ClientSession() as session:
69
- async with session.get(link, timeout=timeout) as resp:
70
- if resp.status != 200:
71
- print(f"⚠️ Failed {link} ({resp.status})")
72
- return ""
73
- html_content = await resp.text()
74
- _html_cache[link] = html_content
75
- return html_content
76
- except Exception as e:
77
- print(f"❌ async_fetch_html error for {link}: {e}")
78
- return ""
79
-
80
- async def ensure_local_file(link: str, saveFolder: str) -> str:
81
- """Ensure file is available locally (Drive or web). Returns local path."""
82
- name = link.split("/")[-1]
83
- local_temp_path = os.path.join(tempfile.gettempdir(), name)
84
-
85
- if os.path.exists(local_temp_path):
86
- return local_temp_path
87
-
88
- # Try Drive first (blocking → offload)
89
- file_id = await asyncio.to_thread(pipeline.find_drive_file, name, saveFolder)
90
- if file_id:
91
- await asyncio.to_thread(pipeline.download_file_from_drive, name, saveFolder, local_temp_path)
92
- else:
93
- # Web download asynchronously
94
- async with aiohttp.ClientSession() as session:
95
- async with session.get(link, timeout=20) as resp:
96
- resp.raise_for_status()
97
- content = await resp.read()
98
- with open(local_temp_path, "wb") as f:
99
- f.write(content)
100
- # Upload back to Drive (offload)
101
- await asyncio.to_thread(pipeline.upload_file_to_drive, local_temp_path, name, saveFolder)
102
-
103
- return local_temp_path
104
-
105
- async def async_extract_text(link, saveFolder):
106
- try:
107
- if link.endswith(".pdf"):
108
- local_path = await ensure_local_file(link, saveFolder)
109
- return await asyncio.to_thread(lambda: pdf.PDFFast(local_path, saveFolder).extract_text())
110
-
111
- elif link.endswith((".doc", ".docx")):
112
- local_path = await ensure_local_file(link, saveFolder)
113
- return await asyncio.to_thread(lambda: wordDoc.WordDocFast(local_path, saveFolder).extractText())
114
-
115
- elif link.endswith((".xls", ".xlsx")):
116
- return ""
117
-
118
- elif link.startswith("http") or "html" in link:
119
- html_content = await async_fetch_html(link)
120
- html = extractHTML.HTML(htmlContent=html_content, htmlLink=link, htmlFile="")
121
- # If you implement async_getListSection, call it here
122
- if hasattr(html, "async_getListSection"):
123
- article_text = await html.async_getListSection()
124
- else:
125
- # fallback: run sync getListSection in a thread
126
- article_text = await asyncio.to_thread(html.getListSection)
127
-
128
- if not article_text:
129
- metadata_text = html.fetch_crossref_metadata(link)
130
- if metadata_text:
131
- article_text = html.mergeTextInJson(metadata_text)
132
- return article_text
133
-
134
- else:
135
- return ""
136
- except Exception as e:
137
- print(f"❌ async_extract_text failed for {link}: {e}")
138
- return ""
139
-
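
The async helpers above are building blocks; a minimal driver that fans several links out concurrently might look like this (the links and folder name are placeholders):

    async def extract_all(links, saveFolder):
        results = await asyncio.gather(
            *(async_extract_text(link, saveFolder) for link in links),
            return_exceptions=True,  # keep one bad link from cancelling the rest
        )
        return {link: ("" if isinstance(r, Exception) else r) for link, r in zip(links, results)}

    # texts = asyncio.run(extract_all(["https://doi.org/10.1000/example"], "shared_folder"))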
140
-
141
- def extract_text(link,saveFolder):
142
- try:
143
- text = ""
144
- name = link.split("/")[-1]
145
- print("name: ", name)
146
- #file_path = Path(saveFolder) / name
147
- local_temp_path = os.path.join(tempfile.gettempdir(), name)
148
- print("this is local temp path: ", local_temp_path)
149
- if os.path.exists(local_temp_path):
150
- input_to_class = local_temp_path
151
- print("exist")
152
- else:
153
- #input_to_class = link # Let the class handle downloading
154
- # 1. Check if file exists in shared Google Drive folder
155
- file_id = pipeline.find_drive_file(name, saveFolder)
156
- if file_id:
157
- print("📥 Downloading from Google Drive...")
158
- pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
159
- else:
160
- print("🌐 Downloading from web link...")
161
- response = requests.get(link)
162
- with open(local_temp_path, 'wb') as f:
163
- f.write(response.content)
164
- print("✅ Saved locally.")
165
-
166
- # 2. Upload to Drive so it's available for later
167
- pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
168
-
169
- input_to_class = local_temp_path
170
- print(input_to_class)
171
- # pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
172
- # pdf
173
- if link.endswith(".pdf"):
174
- # if file_path.is_file():
175
- # link = saveFolder + "/" + name
176
- # print("File exists.")
177
- #p = pdf.PDF(local_temp_path, saveFolder)
178
- print("inside pdf and input to class: ", input_to_class)
179
- print("save folder in extract text: ", saveFolder)
180
- #p = pdf.PDF(input_to_class, saveFolder)
181
- #p = pdf.PDF(link,saveFolder)
182
- #text = p.extractTextWithPDFReader()
183
- #text = p.extractText()
184
- p = pdf.PDFFast(input_to_class, saveFolder)
185
- text = p.extract_text()
186
-
187
- print("len text from pdf:")
188
- print(len(text))
189
- #text_exclude_table = p.extract_text_excluding_tables()
190
- # worddoc
191
- elif link.endswith(".doc") or link.endswith(".docx"):
192
- #d = wordDoc.wordDoc(local_temp_path,saveFolder)
193
- # d = wordDoc.wordDoc(input_to_class,saveFolder)
194
- # text = d.extractTextByPage()
195
- d = wordDoc.WordDocFast(input_to_class, saveFolder)
196
- text = d.extractText()
197
-
198
- # html
199
- else:
200
- if link.split(".")[-1].lower() not in ("xls", "xlsx"):
201
- if "http" in link or "html" in link:
202
- print("html link: ", link)
203
- html = extractHTML.HTML("",link)
204
- text = html.getListSection() # the text already clean
205
- print("len text html: ")
206
- print(len(text))
207
- # Cleanup: delete the local temp file
208
- if name:
209
- if os.path.exists(local_temp_path):
210
- os.remove(local_temp_path)
211
- print(f"🧹 Deleted local temp file: {local_temp_path}")
212
- print("done extract text")
213
- except Exception as e:
- print(f"❌ extract_text failed for {link}: {e}")
214
- text = ""
215
- return text
216
-
217
- def extract_table(link,saveFolder):
218
- try:
219
- table = []
220
- name = link.split("/")[-1]
221
- #file_path = Path(saveFolder) / name
222
- local_temp_path = os.path.join(tempfile.gettempdir(), name)
223
- if os.path.exists(local_temp_path):
224
- input_to_class = local_temp_path
225
- print("exist")
226
- else:
227
- #input_to_class = link # Let the class handle downloading
228
- # 1. Check if file exists in shared Google Drive folder
229
- file_id = pipeline.find_drive_file(name, saveFolder)
230
- if file_id:
231
- print("📥 Downloading from Google Drive...")
232
- pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
233
- else:
234
- print("🌐 Downloading from web link...")
235
- response = requests.get(link)
236
- with open(local_temp_path, 'wb') as f:
237
- f.write(response.content)
238
- print("✅ Saved locally.")
239
-
240
- # 2. Upload to Drive so it's available for later
241
- pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
242
-
243
- input_to_class = local_temp_path
244
- print(input_to_class)
245
- #pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
246
- # pdf
247
- if link.endswith(".pdf"):
248
- # if file_path.is_file():
249
- # link = saveFolder + "/" + name
250
- # print("File exists.")
251
- #p = pdf.PDF(local_temp_path,saveFolder)
252
- p = pdf.PDF(input_to_class,saveFolder)
253
- table = p.extractTable()
254
- # worddoc
255
- elif link.endswith(".doc") or link.endswith(".docx"):
256
- #d = wordDoc.wordDoc(local_temp_path,saveFolder)
257
- # d = wordDoc.wordDoc(input_to_class,saveFolder)
258
- # table = d.extractTableAsList()
259
- d = wordDoc.WordDocFast(input_to_class, saveFolder)
260
- table = d.extractTableAsList()
261
- # excel
262
- elif link.split(".")[-1].lower() in "xlsx":
263
- # download the Excel file if it has not been downloaded yet
264
- savePath = saveFolder +"/"+ link.split("/")[-1]
265
- excelPath = download_excel_file(link, savePath)
266
- try:
267
- #xls = pd.ExcelFile(excelPath)
268
- xls = pd.ExcelFile(local_temp_path)
269
- table_list = []
270
- for sheet_name in xls.sheet_names:
271
- df = pd.read_excel(xls, sheet_name=sheet_name)
272
- cleaned_table = df.fillna("").astype(str).values.tolist()
273
- table_list.append(cleaned_table)
274
- table = table_list
275
- except Exception as e:
276
- print("❌ Failed to extract tables from Excel:", e)
277
- # html
278
- elif "http" in link or "html" in link:
279
- html = extractHTML.HTML("",link)
280
- table = html.extractTable() # table is a list
281
- table = clean_tables_format(table)
282
- # Cleanup: delete the local temp file
283
- if os.path.exists(local_temp_path):
284
- os.remove(local_temp_path)
285
- print(f"🧹 Deleted local temp file: {local_temp_path}")
286
- except:
287
- table = []
288
- return table
289
-
290
- def clean_tables_format(tables):
291
- """
292
- Ensures all tables are in consistent format: List[List[List[str]]]
293
- Cleans by:
294
- - Removing empty strings and rows
295
- - Converting all cells to strings
296
- - Handling DataFrames and list-of-lists
297
- """
298
- cleaned = []
299
- if tables:
300
- for table in tables:
301
- standardized = []
302
-
303
- # Case 1: Pandas DataFrame
304
- if isinstance(table, pd.DataFrame):
305
- table = table.fillna("").astype(str).values.tolist()
306
-
307
- # Case 2: List of Lists
308
- if isinstance(table, list) and all(isinstance(row, list) for row in table):
309
- for row in table:
310
- filtered_row = [str(cell).strip() for cell in row if str(cell).strip()]
311
- if filtered_row:
312
- standardized.append(filtered_row)
313
-
314
- if standardized:
315
- cleaned.append(standardized)
316
-
317
- return cleaned
318
-
319
- import json
320
- def normalize_text_for_comparison(s: str) -> str:
321
- """
322
- Normalizes text for robust comparison by:
323
- 1. Converting to lowercase.
324
- 2. Replacing all types of newlines with a single consistent newline (\n).
325
- 3. Removing extra spaces (e.g., multiple spaces, leading/trailing spaces on lines).
326
- 4. Stripping leading/trailing whitespace from the entire string.
327
- """
328
- s = s.lower()
329
- s = s.replace('\r\n', '\n') # Handle Windows newlines
330
- s = s.replace('\r', '\n') # Handle Mac classic newlines
331
-
332
- # Replace sequences of whitespace (including multiple newlines) with a single space
333
- # This might be too aggressive if you need to preserve paragraph breaks,
334
- # but good for exact word-sequence matching.
335
- s = re.sub(r'\s+', ' ', s)
336
-
337
- return s.strip()
338
- def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
339
- """
340
- Merge cleaned text and table into one string for LLM input.
341
- - Avoids duplicating tables already in text
342
- - Extracts only relevant rows from large tables
343
- - Skips or saves oversized tables
344
- """
345
- import importlib
346
- json = importlib.import_module("json")
347
-
348
- def estimate_tokens(text_str):
349
- try:
350
- enc = tiktoken.get_encoding(tokenizer)
351
- return len(enc.encode(text_str))
352
- except:
353
- return len(text_str) // 4 # Fallback estimate
354
-
355
- def is_table_relevant(table, keywords, accession_id=None):
356
- flat = " ".join(" ".join(row).lower() for row in table)
357
- if accession_id and accession_id.lower() in flat:
358
- return True
359
- return any(kw.lower() in flat for kw in keywords)
360
- preview, preview1 = "",""
361
- llm_input = "## Document Text\n" + text.strip() + "\n"
362
- clean_text = normalize_text_for_comparison(text)
363
-
364
- if tables:
365
- for idx, table in enumerate(tables):
366
- keywords = ["province","district","region","village","location", "country", "region", "origin", "ancient", "modern"]
367
- if accession_id: keywords += [accession_id.lower()]
368
- if isolate: keywords += [isolate.lower()]
369
- if is_table_relevant(table, keywords, accession_id):
370
- if len(table) > 0:
371
- for tab in table:
372
- preview = " ".join(tab) if tab else ""
373
- preview1 = "\n".join(tab) if tab else ""
374
- clean_preview = normalize_text_for_comparison(preview)
375
- clean_preview1 = normalize_text_for_comparison(preview1)
376
- if clean_preview not in clean_text:
377
- if clean_preview1 not in clean_text:
378
- table_str = json.dumps([tab], indent=2)
379
- llm_input += f"## Table {idx+1}\n{table_str}\n"
380
- return llm_input.strip()
381
-
382
- def preprocess_document(link, saveFolder, accession=None, isolate=None, article_text=None):
383
- if article_text:
384
- print("article text already available")
385
- text = article_text
386
- else:
387
- try:
388
- print("start preprocess and extract text")
389
- text = extract_text(link, saveFolder)
390
- except: text = ""
391
- try:
392
- print("extract table start")
393
- success, the_output = pipeline.run_with_timeout(extract_table,args=(link,saveFolder),timeout=10)
394
- print("Returned from timeout logic")
395
- if success:
396
- tables = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
397
- print("yes succeed for extract table")
398
- else:
399
- print("not suceed etxract table")
400
- tables = []
401
- #tables = extract_table(link, saveFolder)
402
- except: tables = []
403
- if accession: accession = accession
404
- if isolate: isolate = isolate
405
- try:
406
- # print("merge text and table start")
407
- # success, the_output = pipeline.run_with_timeout(merge_text_and_tables,kwargs={"text":text,"tables":tables,"accession_id":accession, "isolate":isolate},timeout=30)
408
- # print("Returned from timeout logic")
409
- # if success:
410
- # final_input = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
411
- # print("yes succeed")
412
- # else:
413
- # print("not suceed")
414
- print("just merge text and tables")
415
- final_input = text + ", ".join(tables)
416
- #final_input = pipeline.timeout(merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate)
417
- except:
418
- print("no succeed here in preprocess docu")
419
- final_input = ""
420
- return text, tables, final_input
421
-
422
- def extract_sentences(text):
423
- sentences = re.split(r'(?<=[.!?])\s+', text)
424
- return [s.strip() for s in sentences if s.strip()]
425
-
426
- def is_irrelevant_number_sequence(text):
427
- if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
428
- return False
429
- word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
430
- number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
431
- total_tokens = len(re.findall(r'\S+', text))
432
- if total_tokens > 0 and (word_count / total_tokens < 0.2) and (number_count / total_tokens > 0.5):
433
- return True
434
- elif re.fullmatch(r'(\d+(\.\d+)?\s*)+', text.strip()):
435
- return True
436
- return False
437
-
438
- def remove_isolated_single_digits(sentence):
439
- tokens = sentence.split()
440
- filtered_tokens = []
441
- for token in tokens:
442
- if token == '0' or token == '1':
443
- pass
444
- else:
445
- filtered_tokens.append(token)
446
- return ' '.join(filtered_tokens).strip()
447
-
448
- def get_contextual_sentences_BFS(text_content, keyword, depth=2):
449
- def extract_codes(sentence):
450
- # Match codes like 'A1YU101', 'KM1', 'MO6' — at least 2 letters + numbers
451
- return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
452
- sentences = extract_sentences(text_content)
453
- relevant_sentences = set()
454
- initial_keywords = set()
455
-
456
- # Define a regex to capture codes like A1YU101 or KM1
457
- # This pattern looks for an alphanumeric sequence followed by digits at the end of the string
458
- code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
459
-
460
- # Attempt to parse the keyword into its prefix and numerical part using re.search
461
- keyword_match = code_pattern.search(keyword)
462
-
463
- keyword_prefix = None
464
- keyword_num = None
465
-
466
- if keyword_match:
467
- keyword_prefix = keyword_match.group(1).lower()
468
- keyword_num = int(keyword_match.group(2))
469
-
470
- for sentence in sentences:
471
- sentence_added = False
472
-
473
- # 1. Check for exact match of the keyword
474
- if re.search(r'\b' + re.escape(keyword) + r'\b', sentence, re.IGNORECASE):
475
- relevant_sentences.add(sentence.strip())
476
- initial_keywords.add(keyword.lower())
477
- sentence_added = True
478
-
479
- # 2. Check for range patterns (e.g., A1YU101-A1YU137)
480
- # The range pattern should be broad enough to capture the full code string within the range.
481
- range_matches = re.finditer(r'([A-Z0-9]+-\d+)', sentence, re.IGNORECASE) # More specific range pattern if needed, or rely on full code pattern below
482
- range_matches = re.finditer(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', sentence, re.IGNORECASE) # This is the more robust range pattern
483
-
484
- for r_match in range_matches:
485
- start_code_str = r_match.group(1)
486
- end_code_str = r_match.group(2)
487
-
488
- # CRITICAL FIX: Use code_pattern.search for start_match and end_match
489
- start_match = code_pattern.search(start_code_str)
490
- end_match = code_pattern.search(end_code_str)
491
-
492
- if keyword_prefix and keyword_num is not None and start_match and end_match:
493
- start_prefix = start_match.group(1).lower()
494
- end_prefix = end_match.group(1).lower()
495
- start_num = int(start_match.group(2))
496
- end_num = int(end_match.group(2))
497
-
498
- # Check if the keyword's prefix matches and its number is within the range
499
- if keyword_prefix == start_prefix and \
500
- keyword_prefix == end_prefix and \
501
- start_num <= keyword_num <= end_num:
502
- relevant_sentences.add(sentence.strip())
503
- initial_keywords.add(start_code_str.lower())
504
- initial_keywords.add(end_code_str.lower())
505
- sentence_added = True
506
- break # Only need to find one matching range per sentence
507
-
508
- # 3. If the sentence was added due to exact match or range, add all its alphanumeric codes
509
- # to initial_keywords to ensure graph traversal from related terms.
510
- if sentence_added:
511
- for word in extract_codes(sentence):
512
- initial_keywords.add(word.lower())
513
-
514
-
515
- # Build word_to_sentences mapping for all sentences
516
- word_to_sentences = {}
517
- for sent in sentences:
518
- codes_in_sent = set(extract_codes(sent))
519
- for code in codes_in_sent:
520
- word_to_sentences.setdefault(code.lower(), set()).add(sent.strip())
521
-
522
-
523
- # Build the graph
524
- graph = {}
525
- for sent in sentences:
526
- codes = set(extract_codes(sent))
527
- for word1 in codes:
528
- word1_lower = word1.lower()
529
- graph.setdefault(word1_lower, set())
530
- for word2 in codes:
531
- word2_lower = word2.lower()
532
- if word1_lower != word2_lower:
533
- graph[word1_lower].add(word2_lower)
534
-
535
-
536
- # Perform BFS/graph traversal
537
- queue = [(k, 0) for k in initial_keywords if k in word_to_sentences]
538
- visited_words = set(initial_keywords)
539
-
540
- while queue:
541
- current_word, level = queue.pop(0)
542
- if level >= depth:
543
- continue
544
-
545
- relevant_sentences.update(word_to_sentences.get(current_word, []))
546
-
547
- for neighbor in graph.get(current_word, []):
548
- if neighbor not in visited_words:
549
- visited_words.add(neighbor)
550
- queue.append((neighbor, level + 1))
551
-
552
- final_sentences = set()
553
- for sentence in relevant_sentences:
554
- if not is_irrelevant_number_sequence(sentence):
555
- processed_sentence = remove_isolated_single_digits(sentence)
556
- if processed_sentence:
557
- final_sentences.add(processed_sentence)
558
-
559
- return "\n".join(sorted(list(final_sentences)))
560
-
561
-
562
-
563
- def get_contextual_sentences_DFS(text_content, keyword, depth=2):
564
- sentences = extract_sentences(text_content)
565
-
566
- # Build word-to-sentences mapping
567
- word_to_sentences = {}
568
- for sent in sentences:
569
- words_in_sent = set(re.findall(r'\b[A-Za-z0-9\-_\/]+\b', sent))
570
- for word in words_in_sent:
571
- word_to_sentences.setdefault(word.lower(), set()).add(sent.strip())
572
-
573
- # Function to extract codes in a sentence
574
- def extract_codes(sentence):
575
- # Only codes like 'KSK1', 'MG272794', not pure numbers
576
- return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
577
-
578
- # DFS with priority based on distance to keyword and early stop if country found
579
- def dfs_traverse(current_word, current_depth, max_depth, visited_words, collected_sentences, parent_sentence=None):
580
- country = "unknown"
581
- if current_depth > max_depth:
582
- return country, False
583
-
584
- if current_word not in word_to_sentences:
585
- return country, False
586
-
587
- for sentence in word_to_sentences[current_word]:
588
- if sentence == parent_sentence:
589
- continue # avoid reusing the same sentence
590
-
591
- collected_sentences.add(sentence)
592
-
593
- #print("current_word:", current_word)
594
- small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
595
- #print(small_sen)
596
- country = model.get_country_from_text(small_sen)
597
- #print("small context country:", country)
598
- if country.lower() != "unknown":
599
- return country, True
600
- else:
601
- country = model.get_country_from_text(sentence)
602
- #print("full sentence country:", country)
603
- if country.lower() != "unknown":
604
- return country, True
605
-
606
- codes_in_sentence = extract_codes(sentence)
607
- idx = next((i for i, code in enumerate(codes_in_sentence) if code.lower() == current_word.lower()), None)
608
- if idx is None:
609
- continue
610
-
611
- sorted_children = sorted(
612
- [code for code in codes_in_sentence if code.lower() not in visited_words],
613
- key=lambda x: (abs(codes_in_sentence.index(x) - idx),
614
- 0 if codes_in_sentence.index(x) > idx else 1)
615
- )
616
-
617
- #print("sorted_children:", sorted_children)
618
- for child in sorted_children:
619
- child_lower = child.lower()
620
- if child_lower not in visited_words:
621
- visited_words.add(child_lower)
622
- country, should_stop = dfs_traverse(
623
- child_lower, current_depth + 1, max_depth,
624
- visited_words, collected_sentences, parent_sentence=sentence
625
- )
626
- if should_stop:
627
- return country, True
628
-
629
- return country, False
630
-
631
- # Begin DFS
632
- collected_sentences = set()
633
- visited_words = set([keyword.lower()])
634
- country, status = dfs_traverse(keyword.lower(), 0, depth, visited_words, collected_sentences)
635
-
636
- # Filter irrelevant sentences
637
- final_sentences = set()
638
- for sentence in collected_sentences:
639
- if not is_irrelevant_number_sequence(sentence):
640
- processed = remove_isolated_single_digits(sentence)
641
- if processed:
642
- final_sentences.add(processed)
643
- if not final_sentences:
644
- return country, text_content
645
- return country, "\n".join(sorted(list(final_sentences)))
646
-
647
- # Helper function for normalizing text for overlap comparison
648
- def normalize_for_overlap(s: str) -> str:
649
- s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s).lower()
650
- s = re.sub(r'\s+', ' ', s).strip()
651
- return s
652
-
653
- def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
654
- if not text1: return text2
655
- if not text2: return text1
656
-
657
- # Case 1: text2 is fully contained in text1 or vice-versa
658
- if text2 in text1:
659
- return text1
660
- if text1 in text2:
661
- return text2
662
-
663
- # --- Option 1: Original behavior (suffix of text1, prefix of text2) ---
664
- # This is what your function was primarily designed for.
665
- # It looks for the overlap at the "junction" of text1 and text2.
666
-
667
- max_junction_overlap = 0
668
- for i in range(min(len(text1), len(text2)), 0, -1):
669
- suffix1 = text1[-i:]
670
- prefix2 = text2[:i]
671
- # Prioritize exact match, then normalized match
672
- if suffix1 == prefix2:
673
- max_junction_overlap = i
674
- break
675
- elif normalize_for_overlap(suffix1) == normalize_for_overlap(prefix2):
676
- max_junction_overlap = i
677
- break # Take the first (longest) normalized match
678
-
679
- if max_junction_overlap > 0:
680
- merged_text = text1 + text2[max_junction_overlap:]
681
- return re.sub(r'\s+', ' ', merged_text).strip()
682
-
683
- # --- Option 2: Longest Common Prefix (for cases like "Hi, I am Vy.") ---
684
- # This addresses your specific test case where the overlap is at the very beginning of both strings.
685
- # This is often used when trying to deduplicate content that shares a common start.
686
-
687
- longest_common_prefix_len = 0
688
- min_len = min(len(text1), len(text2))
689
- for i in range(min_len):
690
- if text1[i] == text2[i]:
691
- longest_common_prefix_len = i + 1
692
- else:
693
- break
694
-
695
- # If a common prefix is found AND it's a significant portion (e.g., more than a few chars)
696
- # AND the remaining parts are distinct, then apply this merge.
697
- # This is a heuristic and might need fine-tuning.
698
- if longest_common_prefix_len > 0 and \
699
- text1[longest_common_prefix_len:].strip() and \
700
- text2[longest_common_prefix_len:].strip():
701
-
702
- # Only merge this way if the remaining parts are not empty (i.e., not exact duplicates)
703
- # For "Hi, I am Vy. Nice to meet you." and "Hi, I am Vy. Goodbye Vy."
704
- # common prefix is "Hi, I am Vy."
705
- # Remaining text1: " Nice to meet you."
706
- # Remaining text2: " Goodbye Vy."
707
- # So we merge common_prefix + remaining_text1 + remaining_text2
708
-
709
- common_prefix_str = text1[:longest_common_prefix_len]
710
- remainder_text1 = text1[longest_common_prefix_len:]
711
- remainder_text2 = text2[longest_common_prefix_len:]
712
-
713
- merged_text = common_prefix_str + remainder_text1 + remainder_text2
714
- return re.sub(r'\s+', ' ', merged_text).strip()
715
-
716
-
717
- # If neither specific overlap type is found, just concatenate
718
- merged_text = text1 + text2
719
- return re.sub(r'\s+', ' ', merged_text).strip()
720
-
721
- from docx import Document
722
- from pipeline import upload_file_to_drive
723
- # def save_text_to_docx(text_content: str, file_path: str):
724
- # """
725
- # Saves a given text string into a .docx file.
726
-
727
- # Args:
728
- # text_content (str): The text string to save.
729
- # file_path (str): The full path including the filename where the .docx file will be saved.
730
- # Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
731
- # """
732
- # try:
733
- # document = Document()
734
-
735
- # # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
736
- # for paragraph_text in text_content.split('\n'):
737
- # document.add_paragraph(paragraph_text)
738
-
739
- # document.save(file_path)
740
- # print(f"Text successfully saved to '{file_path}'")
741
- # except Exception as e:
742
- # print(f"Error saving text to docx file: {e}")
743
- # def save_text_to_docx(text_content: str, filename: str, drive_folder_id: str):
744
- # """
745
- # Saves a given text string into a .docx file locally, then uploads to Google Drive.
746
-
747
- # Args:
748
- # text_content (str): The text string to save.
749
- # filename (str): The target .docx file name, e.g. 'BRU18_merged_document.docx'.
750
- # drive_folder_id (str): Google Drive folder ID where to upload the file.
751
- # """
752
- # try:
753
- # # ✅ Save to temporary local path first
754
- # print("file name: ", filename)
755
- # print("length text content: ", len(text_content))
756
- # local_path = os.path.join(tempfile.gettempdir(), filename)
757
- # document = Document()
758
- # for paragraph_text in text_content.split('\n'):
759
- # document.add_paragraph(paragraph_text)
760
- # document.save(local_path)
761
- # print(f"✅ Text saved locally to: {local_path}")
762
-
763
- # # ✅ Upload to Drive
764
- # pipeline.upload_file_to_drive(local_path, filename, drive_folder_id)
765
- # print(f"✅ Uploaded '{filename}' to Google Drive folder ID: {drive_folder_id}")
766
-
767
- # except Exception as e:
768
- # print(f"❌ Error saving or uploading DOCX: {e}")
769
- def save_text_to_docx(text_content: str, full_local_path: str):
770
- document = Document()
771
- for paragraph_text in text_content.split('\n'):
772
- document.add_paragraph(paragraph_text)
773
- document.save(full_local_path)
774
- print(f"✅ Saved DOCX locally: {full_local_path}")
775
-
776
-
777
-
778
- '''Two scenarios:
779
- - quick look finds the keyword and the deep dive yields the location directly, then stop
780
- - quick look finds the keyword but the deep dive yields no location, so hold the related words and
781
- search the other files iteratively for each related word until a location is found, then stop'''
782
- def extract_context(text, keyword, window=500):
783
- # firstly try accession number
784
- code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
785
-
786
- # Attempt to parse the keyword into its prefix and numerical part using re.search
787
- keyword_match = code_pattern.search(keyword)
788
-
789
- keyword_prefix = None
790
- keyword_num = None
791
-
792
- if keyword_match:
793
- keyword_prefix = keyword_match.group(1).lower()
794
- keyword_num = int(keyword_match.group(2))
795
- text = text.lower()
796
- idx = text.find(keyword.lower())
797
- if idx == -1:
798
- if keyword_prefix:
799
- idx = text.find(keyword_prefix)
800
- if idx == -1:
801
- return "Sample ID not found."
802
- return text[max(0, idx-window): idx+window]
803
- return text[max(0, idx-window): idx+window]
804
- def process_inputToken(filePaths, saveLinkFolder,accession=None, isolate=None):
805
- cache = {}
806
- country = "unknown"
807
- output = ""
808
- tem_output, small_output = "",""
809
- keyword_appear = (False,"")
810
- keywords = []
811
- if isolate: keywords.append(isolate)
812
- if accession: keywords.append(accession)
813
- for f in filePaths:
814
- # scenario 1: direct location: truncate the context and then use the QA model
815
- if keywords:
816
- for keyword in keywords:
817
- text, tables, final_input = preprocess_document(f,saveLinkFolder, isolate=keyword)
818
- if keyword in final_input:
819
- context = extract_context(final_input, keyword)
820
- # quick look if country already in context and if yes then return
821
- country = model.get_country_from_text(context)
822
- if country != "unknown":
823
- return country, context, final_input
824
- else:
825
- country = model.get_country_from_text(final_input)
826
- if country != "unknown":
827
- return country, context, final_input
828
- else: # might be cross-ref
829
- keyword_appear = (True, f)
830
- cache[f] = context
831
- small_output = merge_texts_skipping_overlap(output, context) + "\n"
832
- chunkBFS = get_contextual_sentences_BFS(small_output, keyword)
833
- countryBFS = model.get_country_from_text(chunkBFS)
834
- countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
835
- output = merge_texts_skipping_overlap(output, final_input)
836
- if countryDFS != "unknown" and countryBFS != "unknown":
837
- if len(chunkDFS) <= len(chunkBFS):
838
- return countryDFS, chunkDFS, output
839
- else:
840
- return countryBFS, chunkBFS, output
841
- else:
842
- if countryDFS != "unknown":
843
- return countryDFS, chunkDFS, output
844
- if countryBFS != "unknown":
845
- return countryBFS, chunkBFS, output
846
- else:
847
- # scenario 2:
848
- '''cross-ref: ex: A1YU101 keyword in file 2 which includes KM1 but KM1 in file 1
849
- but if we look at file 1 first then maybe we can have lookup dict which country
850
- such as Thailand as the key and its re'''
851
- cache[f] = final_input
852
- if keyword_appear[0] == True:
853
- for c in cache:
854
- if c!=keyword_appear[1]:
855
- if cache[c].lower() not in output.lower():
856
- output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
857
- chunkBFS = get_contextual_sentences_BFS(output, keyword)
858
- countryBFS = model.get_country_from_text(chunkBFS)
859
- countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
860
- if countryDFS != "unknown" and countryBFS != "unknown":
861
- if len(chunkDFS) <= len(chunkBFS):
862
- return countryDFS, chunkDFS, output
863
- else:
864
- return countryBFS, chunkBFS, output
865
- else:
866
- if countryDFS != "unknown":
867
- return countryDFS, chunkDFS, output
868
- if countryBFS != "unknown":
869
- return countryBFS, chunkBFS, output
870
- else:
871
- if cache[f].lower() not in output.lower():
872
- output = merge_texts_skipping_overlap(output, cache[f]) + "\n"
873
- if len(output) == 0 or keyword_appear[0]==False:
874
- for c in cache:
875
- if cache[c].lower() not in output.lower():
876
- output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
877
  return country, "", output
 
1
+ import re
2
+ import os
3
+ #import streamlit as st
4
+ import subprocess
5
+ import re
6
+ from Bio import Entrez
7
+ from docx import Document
8
+ import fitz
9
+ import spacy
10
+ from spacy.cli import download
11
+ from NER.PDF import pdf
12
+ from NER.WordDoc import wordDoc
13
+ from NER.html import extractHTML
14
+ from NER.word2Vec import word2vec
15
+ #from transformers import pipeline
16
+ import urllib.parse, requests
17
+ from pathlib import Path
18
+ import pandas as pd
19
+ import model
20
+ import pipeline
21
+ import tempfile
22
+ import nltk
23
+ nltk.download('punkt_tab')
24
+ def download_excel_file(url, save_path="temp.xlsx"):
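+ # Download an Excel file to save_path: handles Office Online viewer URLs (by extracting the src parameter)
+ # and direct .xls/.xlsx links; any other URL is returned unchanged.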
25
+ if "view.officeapps.live.com" in url:
26
+ parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
27
+ real_url = urllib.parse.unquote(parsed_url["src"][0])
28
+ response = requests.get(real_url)
29
+ with open(save_path, "wb") as f:
30
+ f.write(response.content)
31
+ return save_path
32
+ elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
33
+ response = requests.get(url)
34
+ response.raise_for_status() # Raises error if download fails
35
+ with open(save_path, "wb") as f:
36
+ f.write(response.content)
37
+ print(len(response.content))
38
+ return save_path
39
+ else:
40
+ print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
41
+ return url
42
+
43
+ from pathlib import Path
44
+ import pandas as pd
45
+
46
+ def process_file(link, saveFolder):
47
+ """Returns (file_type, full_path, name) for a given link."""
48
+ name = Path(link).name
49
+ ext = Path(name).suffix.lower()
50
+ file_path = Path(saveFolder) / name
51
+
52
+ # If it's already in saveFolder, update link to local path
53
+ if file_path.is_file():
54
+ link = str(file_path)
55
+
56
+ return ext, link, file_path
57
+
58
+ import asyncio
59
+ import aiohttp
60
+ _html_cache = {}
61
+
62
+ async def async_fetch_html(link: str, timeout: int = 15) -> str:
63
+ """Fetch HTML asynchronously with caching."""
64
+ if link in _html_cache:
65
+ return _html_cache[link]
66
+
67
+ try:
68
+ async with aiohttp.ClientSession() as session:
69
+ async with session.get(link, timeout=timeout) as resp:
70
+ if resp.status != 200:
71
+ print(f"⚠️ Failed {link} ({resp.status})")
72
+ return ""
73
+ html_content = await resp.text()
74
+ _html_cache[link] = html_content
75
+ return html_content
76
+ except Exception as e:
77
+ print(f"❌ async_fetch_html error for {link}: {e}")
78
+ return ""
79
+
80
+ async def ensure_local_file(link: str, saveFolder: str) -> str:
81
+ """Ensure file is available locally (Drive or web). Returns local path."""
82
+ name = link.split("/")[-1]
83
+ local_temp_path = os.path.join(tempfile.gettempdir(), name)
84
+
85
+ if os.path.exists(local_temp_path):
86
+ return local_temp_path
87
+
88
+ # Try Drive first (blocking → offload)
89
+ file_id = await asyncio.to_thread(pipeline.find_drive_file, name, saveFolder)
90
+ if file_id:
91
+ await asyncio.to_thread(pipeline.download_file_from_drive, name, saveFolder, local_temp_path)
92
+ else:
93
+ # Web download asynchronously
94
+ async with aiohttp.ClientSession() as session:
95
+ async with session.get(link, timeout=20) as resp:
96
+ resp.raise_for_status()
97
+ content = await resp.read()
98
+ with open(local_temp_path, "wb") as f:
99
+ f.write(content)
100
+ # Upload back to Drive (offload)
101
+ await asyncio.to_thread(pipeline.upload_file_to_drive, local_temp_path, name, saveFolder)
102
+
103
+ return local_temp_path
104
+
105
+ async def async_extract_text(link, saveFolder):
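+ # Async text extraction: PDFs and Word files are resolved via ensure_local_file and parsed in worker threads,
+ # while HTML links are fetched with aiohttp and fall back to CrossRef metadata when the page yields no text.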
106
+ try:
107
+ if link.endswith(".pdf"):
108
+ local_path = await ensure_local_file(link, saveFolder)
109
+ return await asyncio.to_thread(lambda: pdf.PDFFast(local_path, saveFolder).extract_text())
110
+
111
+ elif link.endswith((".doc", ".docx")):
112
+ local_path = await ensure_local_file(link, saveFolder)
113
+ return await asyncio.to_thread(lambda: wordDoc.WordDocFast(local_path, saveFolder).extractText())
114
+
115
+ elif link.endswith((".xls", ".xlsx")):
116
+ return ""
117
+
118
+ elif link.startswith("http") or "html" in link:
119
+ html_content = await async_fetch_html(link)
120
+ html = extractHTML.HTML(htmlContent=html_content, htmlLink=link, htmlFile="")
121
+ # If you implement async_getListSection, call it here
122
+ if hasattr(html, "async_getListSection"):
123
+ article_text = await html.async_getListSection()
124
+ else:
125
+ # fallback: run sync getListSection in a thread
126
+ article_text = await asyncio.to_thread(html.getListSection)
127
+
128
+ if not article_text:
129
+ metadata_text = html.fetch_crossref_metadata(link)
130
+ if metadata_text:
131
+ article_text = html.mergeTextInJson(metadata_text)
132
+ return article_text
133
+
134
+ else:
135
+ return ""
136
+ except Exception as e:
137
+ print(f"❌ async_extract_text failed for {link}: {e}")
138
+ return ""
139
+
140
+
141
+ def extract_text(link,saveFolder):
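+ # Synchronous text extractor: resolve the file locally (temp dir, Google Drive, or web download),
+ # dispatch on extension (.pdf, .doc/.docx, otherwise HTML), then delete the temporary copy.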
142
+ try:
143
+ text = ""
144
+ name = link.split("/")[-1]
145
+ print("name: ", name)
146
+ #file_path = Path(saveFolder) / name
147
+ local_temp_path = os.path.join(tempfile.gettempdir(), name)
148
+ print("this is local temp path: ", local_temp_path)
149
+ if os.path.exists(local_temp_path):
150
+ input_to_class = local_temp_path
151
+ print("exist")
152
+ else:
153
+ #input_to_class = link # Let the class handle downloading
154
+ # 1. Check if file exists in shared Google Drive folder
155
+ file_id = pipeline.find_drive_file(name, saveFolder)
156
+ if file_id:
157
+ print("📥 Downloading from Google Drive...")
158
+ pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
159
+ else:
160
+ print("🌐 Downloading from web link...")
161
+ response = requests.get(link)
162
+ with open(local_temp_path, 'wb') as f:
163
+ f.write(response.content)
164
+ print("✅ Saved locally.")
165
+
166
+ # 2. Upload to Drive so it's available for later
167
+ pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
168
+
169
+ input_to_class = local_temp_path
170
+ print(input_to_class)
171
+ # pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
172
+ # pdf
173
+ if link.endswith(".pdf"):
174
+ # if file_path.is_file():
175
+ # link = saveFolder + "/" + name
176
+ # print("File exists.")
177
+ #p = pdf.PDF(local_temp_path, saveFolder)
178
+ print("inside pdf and input to class: ", input_to_class)
179
+ print("save folder in extract text: ", saveFolder)
180
+ #p = pdf.PDF(input_to_class, saveFolder)
181
+ #p = pdf.PDF(link,saveFolder)
182
+ #text = p.extractTextWithPDFReader()
183
+ #text = p.extractText()
184
+ p = pdf.PDFFast(input_to_class, saveFolder)
185
+ text = p.extract_text()
186
+
187
+ print("len text from pdf:")
188
+ print(len(text))
189
+ #text_exclude_table = p.extract_text_excluding_tables()
190
+ # worddoc
191
+ elif link.endswith(".doc") or link.endswith(".docx"):
192
+ #d = wordDoc.wordDoc(local_temp_path,saveFolder)
193
+ # d = wordDoc.wordDoc(input_to_class,saveFolder)
194
+ # text = d.extractTextByPage()
195
+ d = wordDoc.WordDocFast(input_to_class, saveFolder)
196
+ text = d.extractText()
197
+
198
+ # html
199
+ else:
200
+ if link.split(".")[-1].lower() not in "xlsx":
201
+ if "http" in link or "html" in link:
202
+ print("html link: ", link)
203
+ html = extractHTML.HTML("",link)
204
+ text = html.getListSection() # the text already clean
205
+ print("len text html: ")
206
+ print(len(text))
207
+ # Cleanup: delete the local temp file
208
+ if name:
209
+ if os.path.exists(local_temp_path):
210
+ os.remove(local_temp_path)
211
+ print(f"🧹 Deleted local temp file: {local_temp_path}")
212
+ print("done extract text")
213
+ except:
214
+ text = ""
215
+ return text
216
+
217
+ def extract_table(link,saveFolder):
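+ # Table extractor: resolve the file locally as in extract_text, then pull tables with pdf.PDF,
+ # WordDocFast, pandas (Excel sheets), or extractHTML depending on the file type.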
218
+ try:
219
+ table = []
220
+ name = link.split("/")[-1]
221
+ #file_path = Path(saveFolder) / name
222
+ local_temp_path = os.path.join(tempfile.gettempdir(), name)
223
+ if os.path.exists(local_temp_path):
224
+ input_to_class = local_temp_path
225
+ print("exist")
226
+ else:
227
+ #input_to_class = link # Let the class handle downloading
228
+ # 1. Check if file exists in shared Google Drive folder
229
+ file_id = pipeline.find_drive_file(name, saveFolder)
230
+ if file_id:
231
+ print("📥 Downloading from Google Drive...")
232
+ pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
233
+ else:
234
+ print("🌐 Downloading from web link...")
235
+ response = requests.get(link)
236
+ with open(local_temp_path, 'wb') as f:
237
+ f.write(response.content)
238
+ print("✅ Saved locally.")
239
+
240
+ # 2. Upload to Drive so it's available for later
241
+ pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
242
+
243
+ input_to_class = local_temp_path
244
+ print(input_to_class)
245
+ #pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
246
+ # pdf
247
+ if link.endswith(".pdf"):
248
+ # if file_path.is_file():
249
+ # link = saveFolder + "/" + name
250
+ # print("File exists.")
251
+ #p = pdf.PDF(local_temp_path,saveFolder)
252
+ p = pdf.PDF(input_to_class,saveFolder)
253
+ table = p.extractTable()
254
+ # worddoc
255
+ elif link.endswith(".doc") or link.endswith(".docx"):
256
+ #d = wordDoc.wordDoc(local_temp_path,saveFolder)
257
+ # d = wordDoc.wordDoc(input_to_class,saveFolder)
258
+ # table = d.extractTableAsList()
259
+ d = wordDoc.WordDocFast(input_to_class, saveFolder)
260
+ table = d.extractTableAsList()
261
+ # excel
262
+ elif link.split(".")[-1].lower() in "xlsx":
263
+ # download the Excel file if it has not been downloaded yet
264
+ savePath = saveFolder +"/"+ link.split("/")[-1]
265
+ excelPath = download_excel_file(link, savePath)
266
+ try:
267
+ #xls = pd.ExcelFile(excelPath)
268
+ xls = pd.ExcelFile(local_temp_path)
269
+ table_list = []
270
+ for sheet_name in xls.sheet_names:
271
+ df = pd.read_excel(xls, sheet_name=sheet_name)
272
+ cleaned_table = df.fillna("").astype(str).values.tolist()
273
+ table_list.append(cleaned_table)
274
+ table = table_list
275
+ except Exception as e:
276
+ print("❌ Failed to extract tables from Excel:", e)
277
+ # html
278
+ elif "http" in link or "html" in link:
279
+ html = extractHTML.HTML("",link)
280
+ table = html.extractTable() # table is a list
281
+ table = clean_tables_format(table)
282
+ # Cleanup: delete the local temp file
283
+ if os.path.exists(local_temp_path):
284
+ os.remove(local_temp_path)
285
+ print(f"🧹 Deleted local temp file: {local_temp_path}")
286
+ except:
287
+ table = []
288
+ return table
289
+
290
+ def clean_tables_format(tables):
291
+ """
292
+ Ensures all tables are in consistent format: List[List[List[str]]]
293
+ Cleans by:
294
+ - Removing empty strings and rows
295
+ - Converting all cells to strings
296
+ - Handling DataFrames and list-of-lists
297
+ """
298
+ cleaned = []
299
+ if tables:
300
+ for table in tables:
301
+ standardized = []
302
+
303
+ # Case 1: Pandas DataFrame
304
+ if isinstance(table, pd.DataFrame):
305
+ table = table.fillna("").astype(str).values.tolist()
306
+
307
+ # Case 2: List of Lists
308
+ if isinstance(table, list) and all(isinstance(row, list) for row in table):
309
+ for row in table:
310
+ filtered_row = [str(cell).strip() for cell in row if str(cell).strip()]
311
+ if filtered_row:
312
+ standardized.append(filtered_row)
313
+
314
+ if standardized:
315
+ cleaned.append(standardized)
316
+
317
+ return cleaned
318
+
319
+ import json
320
+ def normalize_text_for_comparison(s: str) -> str:
321
+ """
322
+ Normalizes text for robust comparison by:
323
+ 1. Converting to lowercase.
324
+ 2. Replacing all types of newlines with a single consistent newline (\n).
325
+ 3. Removing extra spaces (e.g., multiple spaces, leading/trailing spaces on lines).
326
+ 4. Stripping leading/trailing whitespace from the entire string.
327
+ """
328
+ s = s.lower()
329
+ s = s.replace('\r\n', '\n') # Handle Windows newlines
330
+ s = s.replace('\r', '\n') # Handle Mac classic newlines
331
+
332
+ # Replace sequences of whitespace (including multiple newlines) with a single space
333
+ # This might be too aggressive if you need to preserve paragraph breaks,
334
+ # but good for exact word-sequence matching.
335
+ s = re.sub(r'\s+', ' ', s)
336
+
337
+ return s.strip()
338
+ def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
339
+ """
340
+ Merge cleaned text and table into one string for LLM input.
341
+ - Avoids duplicating tables already in text
342
+ - Extracts only relevant rows from large tables
343
+ - Skips or saves oversized tables
344
+ """
345
+ import importlib
346
+ json = importlib.import_module("json")
347
+
348
+ def estimate_tokens(text_str):
349
+ try:
350
+ enc = tiktoken.get_encoding(tokenizer)
351
+ return len(enc.encode(text_str))
352
+ except:
353
+ return len(text_str) // 4 # Fallback estimate
354
+
355
+ def is_table_relevant(table, keywords, accession_id=None):
356
+ flat = " ".join(" ".join(row).lower() for row in table)
357
+ if accession_id and accession_id.lower() in flat:
358
+ return True
359
+ return any(kw.lower() in flat for kw in keywords)
360
+ preview, preview1 = "",""
361
+ llm_input = "## Document Text\n" + text.strip() + "\n"
362
+ clean_text = normalize_text_for_comparison(text)
363
+
364
+ if tables:
365
+ for idx, table in enumerate(tables):
366
+ keywords = ["province","district","region","village","location", "country", "region", "origin", "ancient", "modern"]
367
+ if accession_id: keywords += [accession_id.lower()]
368
+ if isolate: keywords += [isolate.lower()]
369
+ if is_table_relevant(table, keywords, accession_id):
370
+ if len(table) > 0:
371
+ for tab in table:
372
+ preview = " ".join(tab) if tab else ""
373
+ preview1 = "\n".join(tab) if tab else ""
374
+ clean_preview = normalize_text_for_comparison(preview)
375
+ clean_preview1 = normalize_text_for_comparison(preview1)
376
+ if clean_preview not in clean_text:
377
+ if clean_preview1 not in clean_text:
378
+ table_str = json.dumps([tab], indent=2)
379
+ llm_input += f"## Table {idx+1}\n{table_str}\n"
380
+ return llm_input.strip()
381
+
382
+ def preprocess_document(link, saveFolder, accession=None, isolate=None, article_text=None):
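+ # Extract text and tables for one document and concatenate them into a single LLM input string;
+ # table extraction runs under a 10-second timeout. Returns (text, tables, final_input).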
383
+ if article_text:
384
+ print("article text already available")
385
+ text = article_text
386
+ else:
387
+ try:
388
+ print("start preprocess and extract text")
389
+ text = extract_text(link, saveFolder)
390
+ except: text = ""
391
+ try:
392
+ print("extract table start")
393
+ success, the_output = pipeline.run_with_timeout(extract_table,args=(link,saveFolder),timeout=10)
394
+ print("Returned from timeout logic")
395
+ if success:
396
+ tables = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
397
+ print("yes succeed for extract table")
398
+ else:
399
+ print("not suceed etxract table")
400
+ tables = []
401
+ #tables = extract_table(link, saveFolder)
402
+ except: tables = []
403
+ if accession: accession = accession
404
+ if isolate: isolate = isolate
405
+ try:
406
+ # print("merge text and table start")
407
+ # success, the_output = pipeline.run_with_timeout(merge_text_and_tables,kwargs={"text":text,"tables":tables,"accession_id":accession, "isolate":isolate},timeout=30)
408
+ # print("Returned from timeout logic")
409
+ # if success:
410
+ # final_input = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
411
+ # print("yes succeed")
412
+ # else:
413
+ # print("not suceed")
414
+ print("just merge text and tables")
415
+ final_input = text + "\n" + "\n".join(json.dumps(t) for t in tables)  # tables are nested lists, so serialize before joining
416
+ #final_input = pipeline.timeout(merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate)
417
+ except:
418
+ print("no succeed here in preprocess docu")
419
+ final_input = ""
420
+ return text, tables, final_input
421
+
422
+ def extract_sentences(text):
423
+ sentences = re.split(r'(?<=[.!?])\s+', text)
424
+ return [s.strip() for s in sentences if s.strip()]
425
+
426
+ def is_irrelevant_number_sequence(text):
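+ # Heuristic filter: a chunk counts as irrelevant when it is mostly numeric (under 20% word tokens,
+ # over 50% number tokens) and contains no sample-style codes.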
427
+ if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
428
+ return False
429
+ word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
430
+ number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
431
+ total_tokens = len(re.findall(r'\S+', text))
432
+ if total_tokens > 0 and (word_count / total_tokens < 0.2) and (number_count / total_tokens > 0.5):
433
+ return True
434
+ elif re.fullmatch(r'(\d+(\.\d+)?\s*)+', text.strip()):
435
+ return True
436
+ return False
437
+
438
+ def remove_isolated_single_digits(sentence):
439
+ tokens = sentence.split()
440
+ filtered_tokens = []
441
+ for token in tokens:
442
+ if token == '0' or token == '1':
443
+ pass
444
+ else:
445
+ filtered_tokens.append(token)
446
+ return ' '.join(filtered_tokens).strip()
447
+
448
+ def get_contextual_sentences_BFS(text_content, keyword, depth=2):
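+ # Collect sentences that mention the keyword (including ranges such as A1YU101-A1YU137 that cover it),
+ # then expand breadth-first over sentences sharing sample codes, up to `depth` hops.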
449
+ def extract_codes(sentence):
450
+ # Match codes like 'A1YU101', 'KM1', 'MO6' — at least 2 letters + numbers
451
+ return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
452
+ sentences = extract_sentences(text_content)
453
+ relevant_sentences = set()
454
+ initial_keywords = set()
455
+
456
+ # Define a regex to capture codes like A1YU101 or KM1
457
+ # This pattern looks for an alphanumeric sequence followed by digits at the end of the string
458
+ code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
459
+
460
+ # Attempt to parse the keyword into its prefix and numerical part using re.search
461
+ keyword_match = code_pattern.search(keyword)
462
+
463
+ keyword_prefix = None
464
+ keyword_num = None
465
+
466
+ if keyword_match:
467
+ keyword_prefix = keyword_match.group(1).lower()
468
+ keyword_num = int(keyword_match.group(2))
469
+
470
+ for sentence in sentences:
471
+ sentence_added = False
472
+
473
+ # 1. Check for exact match of the keyword
474
+ if re.search(r'\b' + re.escape(keyword) + r'\b', sentence, re.IGNORECASE):
475
+ relevant_sentences.add(sentence.strip())
476
+ initial_keywords.add(keyword.lower())
477
+ sentence_added = True
478
+
479
+ # 2. Check for range patterns (e.g., A1YU101-A1YU137)
480
+ # The range pattern should be broad enough to capture the full code string within the range.
481
+ range_matches = re.finditer(r'([A-Z0-9]+-\d+)', sentence, re.IGNORECASE) # More specific range pattern if needed, or rely on full code pattern below
482
+ range_matches = re.finditer(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', sentence, re.IGNORECASE) # This is the more robust range pattern
483
+
484
+ for r_match in range_matches:
485
+ start_code_str = r_match.group(1)
486
+ end_code_str = r_match.group(2)
487
+
488
+ # CRITICAL FIX: Use code_pattern.search for start_match and end_match
489
+ start_match = code_pattern.search(start_code_str)
490
+ end_match = code_pattern.search(end_code_str)
491
+
492
+ if keyword_prefix and keyword_num is not None and start_match and end_match:
493
+ start_prefix = start_match.group(1).lower()
494
+ end_prefix = end_match.group(1).lower()
495
+ start_num = int(start_match.group(2))
496
+ end_num = int(end_match.group(2))
497
+
498
+ # Check if the keyword's prefix matches and its number is within the range
499
+ if keyword_prefix == start_prefix and \
500
+ keyword_prefix == end_prefix and \
501
+ start_num <= keyword_num <= end_num:
502
+ relevant_sentences.add(sentence.strip())
503
+ initial_keywords.add(start_code_str.lower())
504
+ initial_keywords.add(end_code_str.lower())
505
+ sentence_added = True
506
+ break # Only need to find one matching range per sentence
507
+
508
+ # 3. If the sentence was added due to exact match or range, add all its alphanumeric codes
509
+ # to initial_keywords to ensure graph traversal from related terms.
510
+ if sentence_added:
511
+ for word in extract_codes(sentence):
512
+ initial_keywords.add(word.lower())
513
+
514
+
515
+ # Build word_to_sentences mapping for all sentences
516
+ word_to_sentences = {}
517
+ for sent in sentences:
518
+ codes_in_sent = set(extract_codes(sent))
519
+ for code in codes_in_sent:
520
+ word_to_sentences.setdefault(code.lower(), set()).add(sent.strip())
521
+
522
+
523
+ # Build the graph
524
+ graph = {}
525
+ for sent in sentences:
526
+ codes = set(extract_codes(sent))
527
+ for word1 in codes:
528
+ word1_lower = word1.lower()
529
+ graph.setdefault(word1_lower, set())
530
+ for word2 in codes:
531
+ word2_lower = word2.lower()
532
+ if word1_lower != word2_lower:
533
+ graph[word1_lower].add(word2_lower)
534
+
535
+
536
+ # Perform BFS/graph traversal
537
+ queue = [(k, 0) for k in initial_keywords if k in word_to_sentences]
538
+ visited_words = set(initial_keywords)
539
+
540
+ while queue:
541
+ current_word, level = queue.pop(0)
542
+ if level >= depth:
543
+ continue
544
+
545
+ relevant_sentences.update(word_to_sentences.get(current_word, []))
546
+
547
+ for neighbor in graph.get(current_word, []):
548
+ if neighbor not in visited_words:
549
+ visited_words.add(neighbor)
550
+ queue.append((neighbor, level + 1))
551
+
552
+ final_sentences = set()
553
+ for sentence in relevant_sentences:
554
+ if not is_irrelevant_number_sequence(sentence):
555
+ processed_sentence = remove_isolated_single_digits(sentence)
556
+ if processed_sentence:
557
+ final_sentences.add(processed_sentence)
558
+
559
+ return "\n".join(sorted(list(final_sentences)))
560
+
561
+
562
+
563
+ def get_contextual_sentences_DFS(text_content, keyword, depth=2):
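+ # Depth-first variant: follow co-occurring sample codes sentence by sentence, querying
+ # model.get_country_from_text at each step and stopping once a country is found; returns (country, context).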
564
+ sentences = extract_sentences(text_content)
565
+
566
+ # Build word-to-sentences mapping
567
+ word_to_sentences = {}
568
+ for sent in sentences:
569
+ words_in_sent = set(re.findall(r'\b[A-Za-z0-9\-_\/]+\b', sent))
570
+ for word in words_in_sent:
571
+ word_to_sentences.setdefault(word.lower(), set()).add(sent.strip())
572
+
573
+ # Function to extract codes in a sentence
574
+ def extract_codes(sentence):
575
+ # Only codes like 'KSK1', 'MG272794', not pure numbers
576
+ return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
577
+
578
+ # DFS with priority based on distance to keyword and early stop if country found
579
+ def dfs_traverse(current_word, current_depth, max_depth, visited_words, collected_sentences, parent_sentence=None):
580
+ country = "unknown"
581
+ if current_depth > max_depth:
582
+ return country, False
583
+
584
+ if current_word not in word_to_sentences:
585
+ return country, False
586
+
587
+ for sentence in word_to_sentences[current_word]:
588
+ if sentence == parent_sentence:
589
+ continue # avoid reusing the same sentence
590
+
591
+ collected_sentences.add(sentence)
592
+
593
+ #print("current_word:", current_word)
594
+ small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
595
+ #print(small_sen)
596
+ country = model.get_country_from_text(small_sen)
597
+ #print("small context country:", country)
598
+ if country.lower() != "unknown":
599
+ return country, True
600
+ else:
601
+ country = model.get_country_from_text(sentence)
602
+ #print("full sentence country:", country)
603
+ if country.lower() != "unknown":
604
+ return country, True
605
+
606
+ codes_in_sentence = extract_codes(sentence)
607
+ idx = next((i for i, code in enumerate(codes_in_sentence) if code.lower() == current_word.lower()), None)
608
+ if idx is None:
609
+ continue
610
+
611
+ sorted_children = sorted(
612
+ [code for code in codes_in_sentence if code.lower() not in visited_words],
613
+ key=lambda x: (abs(codes_in_sentence.index(x) - idx),
614
+ 0 if codes_in_sentence.index(x) > idx else 1)
615
+ )
616
+
617
+ #print("sorted_children:", sorted_children)
618
+ for child in sorted_children:
619
+ child_lower = child.lower()
620
+ if child_lower not in visited_words:
621
+ visited_words.add(child_lower)
622
+ country, should_stop = dfs_traverse(
623
+ child_lower, current_depth + 1, max_depth,
624
+ visited_words, collected_sentences, parent_sentence=sentence
625
+ )
626
+ if should_stop:
627
+ return country, True
628
+
629
+ return country, False
630
+
631
+ # Begin DFS
632
+ collected_sentences = set()
633
+ visited_words = set([keyword.lower()])
634
+ country, status = dfs_traverse(keyword.lower(), 0, depth, visited_words, collected_sentences)
635
+
636
+ # Filter irrelevant sentences
637
+ final_sentences = set()
638
+ for sentence in collected_sentences:
639
+ if not is_irrelevant_number_sequence(sentence):
640
+ processed = remove_isolated_single_digits(sentence)
641
+ if processed:
642
+ final_sentences.add(processed)
643
+ if not final_sentences:
644
+ return country, text_content
645
+ return country, "\n".join(sorted(list(final_sentences)))
646
+
647
+ # Helper function for normalizing text for overlap comparison
648
+ def normalize_for_overlap(s: str) -> str:
649
+ s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s).lower()
650
+ s = re.sub(r'\s+', ' ', s).strip()
651
+ return s
652
+
653
+ def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
654
+ if not text1: return text2
655
+ if not text2: return text1
656
+
657
+ # Case 1: text2 is fully contained in text1 or vice-versa
658
+ if text2 in text1:
659
+ return text1
660
+ if text1 in text2:
661
+ return text2
662
+
663
+ # --- Option 1: Original behavior (suffix of text1, prefix of text2) ---
664
+ # This is what your function was primarily designed for.
665
+ # It looks for the overlap at the "junction" of text1 and text2.
666
+
667
+ max_junction_overlap = 0
668
+ for i in range(min(len(text1), len(text2)), 0, -1):
669
+ suffix1 = text1[-i:]
670
+ prefix2 = text2[:i]
671
+ # Prioritize exact match, then normalized match
672
+ if suffix1 == prefix2:
673
+ max_junction_overlap = i
674
+ break
675
+ elif normalize_for_overlap(suffix1) == normalize_for_overlap(prefix2):
676
+ max_junction_overlap = i
677
+ break # Take the first (longest) normalized match
678
+
679
+ if max_junction_overlap > 0:
680
+ merged_text = text1 + text2[max_junction_overlap:]
681
+ return re.sub(r'\s+', ' ', merged_text).strip()
682
+
683
+ # --- Option 2: Longest Common Prefix (for cases like "Hi, I am Vy.") ---
684
+ # This addresses your specific test case where the overlap is at the very beginning of both strings.
685
+ # This is often used when trying to deduplicate content that shares a common start.
686
+
687
+ longest_common_prefix_len = 0
688
+ min_len = min(len(text1), len(text2))
689
+ for i in range(min_len):
690
+ if text1[i] == text2[i]:
691
+ longest_common_prefix_len = i + 1
692
+ else:
693
+ break
694
+
695
+ # If a common prefix is found AND it's a significant portion (e.g., more than a few chars)
696
+ # AND the remaining parts are distinct, then apply this merge.
697
+ # This is a heuristic and might need fine-tuning.
698
+ if longest_common_prefix_len > 0 and \
699
+ text1[longest_common_prefix_len:].strip() and \
700
+ text2[longest_common_prefix_len:].strip():
701
+
702
+ # Only merge this way if the remaining parts are not empty (i.e., not exact duplicates)
703
+ # For "Hi, I am Vy. Nice to meet you." and "Hi, I am Vy. Goodbye Vy."
704
+ # common prefix is "Hi, I am Vy."
705
+ # Remaining text1: " Nice to meet you."
706
+ # Remaining text2: " Goodbye Vy."
707
+ # So we merge common_prefix + remaining_text1 + remaining_text2
708
+
709
+ common_prefix_str = text1[:longest_common_prefix_len]
710
+ remainder_text1 = text1[longest_common_prefix_len:]
711
+ remainder_text2 = text2[longest_common_prefix_len:]
712
+
713
+ merged_text = common_prefix_str + remainder_text1 + remainder_text2
714
+ return re.sub(r'\s+', ' ', merged_text).strip()
715
+
716
+
717
+ # If neither specific overlap type is found, just concatenate
718
+ merged_text = text1 + text2
719
+ return re.sub(r'\s+', ' ', merged_text).strip()
720
+
721
+ from docx import Document
722
+ from pipeline import upload_file_to_drive
723
+ # def save_text_to_docx(text_content: str, file_path: str):
724
+ # """
725
+ # Saves a given text string into a .docx file.
726
+
727
+ # Args:
728
+ # text_content (str): The text string to save.
729
+ # file_path (str): The full path including the filename where the .docx file will be saved.
730
+ # Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
731
+ # """
732
+ # try:
733
+ # document = Document()
734
+
735
+ # # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
736
+ # for paragraph_text in text_content.split('\n'):
737
+ # document.add_paragraph(paragraph_text)
738
+
739
+ # document.save(file_path)
740
+ # print(f"Text successfully saved to '{file_path}'")
741
+ # except Exception as e:
742
+ # print(f"Error saving text to docx file: {e}")
743
+ # def save_text_to_docx(text_content: str, filename: str, drive_folder_id: str):
744
+ # """
745
+ # Saves a given text string into a .docx file locally, then uploads to Google Drive.
746
+
747
+ # Args:
748
+ # text_content (str): The text string to save.
749
+ # filename (str): The target .docx file name, e.g. 'BRU18_merged_document.docx'.
750
+ # drive_folder_id (str): Google Drive folder ID where to upload the file.
751
+ # """
752
+ # try:
753
+ # # ✅ Save to temporary local path first
754
+ # print("file name: ", filename)
755
+ # print("length text content: ", len(text_content))
756
+ # local_path = os.path.join(tempfile.gettempdir(), filename)
757
+ # document = Document()
758
+ # for paragraph_text in text_content.split('\n'):
759
+ # document.add_paragraph(paragraph_text)
760
+ # document.save(local_path)
761
+ # print(f"✅ Text saved locally to: {local_path}")
762
+
763
+ # # ✅ Upload to Drive
764
+ # pipeline.upload_file_to_drive(local_path, filename, drive_folder_id)
765
+ # print(f"✅ Uploaded '{filename}' to Google Drive folder ID: {drive_folder_id}")
766
+
767
+ # except Exception as e:
768
+ # print(f"❌ Error saving or uploading DOCX: {e}")
769
+ def save_text_to_docx(text_content: str, full_local_path: str):
770
+ document = Document()
771
+ for paragraph_text in text_content.split('\n'):
772
+ document.add_paragraph(paragraph_text)
773
+ document.save(full_local_path)
774
+ print(f"✅ Saved DOCX locally: {full_local_path}")
775
+
776
+
777
+
778
+ '''Two scenarios:
779
+ - quick look finds the keyword and the deep dive yields the location directly, then stop
780
+ - quick look finds the keyword but the deep dive yields no location, so hold the related words and
781
+ search the other files iteratively for each related word until a location is found, then stop'''
782
+ def extract_context(text, keyword, window=500):
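+ # Return a window of text (default +/-500 characters) around the keyword, falling back to the
+ # keyword's prefix (the part before its trailing digits) when the full ID is not found.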
783
+ # firstly try accession number
784
+ code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
785
+
786
+ # Attempt to parse the keyword into its prefix and numerical part using re.search
787
+ keyword_match = code_pattern.search(keyword)
788
+
789
+ keyword_prefix = None
790
+ keyword_num = None
791
+
792
+ if keyword_match:
793
+ keyword_prefix = keyword_match.group(1).lower()
794
+ keyword_num = int(keyword_match.group(2))
795
+ text = text.lower()
796
+ idx = text.find(keyword.lower())
797
+ if idx == -1:
798
+ if keyword_prefix:
799
+ idx = text.find(keyword_prefix)
800
+ if idx == -1:
801
+ return "Sample ID not found."
802
+ return text[max(0, idx-window): idx+window]
803
+ return text[max(0, idx-window): idx+window]
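A small illustration of extract_context as defined above: it lowercases the text, falls back to the accession prefix when the full keyword is absent, and returns a slice of up to window characters on each side of the match (the strings below are made up for illustration):

from data_preprocess import extract_context

text = "Table S2: sample KU131308 (isolate BRU18) was collected in Brunei, Borneo."
print(extract_context(text, "KU131308", window=25))
# -> a lowercased slice centered on "ku131308"
print(extract_context(text, "XY999999", window=25))
# -> "Sample ID not found." (neither the keyword nor its "xy" prefix occurs)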
804
+ def process_inputToken(filePaths, saveLinkFolder,accession=None, isolate=None):
805
+ cache = {}
806
+ country = "unknown"
807
+ output = ""
808
+ tem_output, small_output = "",""
809
+ keyword_appear = (False,"")
810
+ keywords = []
811
+ if isolate: keywords.append(isolate)
812
+ if accession: keywords.append(accession)
813
+ for f in filePaths:
814
+ # scenario 1: direct location: truncate the context and then use the QA model
815
+ if keywords:
816
+ for keyword in keywords:
817
+ text, tables, final_input = preprocess_document(f,saveLinkFolder, isolate=keyword)
818
+ if keyword in final_input:
819
+ context = extract_context(final_input, keyword)
820
+ # quick look: if a country is already in the context, return it right away
821
+ country = model.get_country_from_text(context)
822
+ if country != "unknown":
823
+ return country, context, final_input
824
+ else:
825
+ country = model.get_country_from_text(final_input)
826
+ if country != "unknown":
827
+ return country, context, final_input
828
+ else: # might be cross-ref
829
+ keyword_appear = (True, f)
830
+ cache[f] = context
831
+ small_output = merge_texts_skipping_overlap(output, context) + "\n"
832
+ chunkBFS = get_contextual_sentences_BFS(small_output, keyword)
833
+ countryBFS = model.get_country_from_text(chunkBFS)
834
+ countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
835
+ output = merge_texts_skipping_overlap(output, final_input)
836
+ if countryDFS != "unknown" and countryBFS != "unknown":
837
+ if len(chunkDFS) <= len(chunkBFS):
838
+ return countryDFS, chunkDFS, output
839
+ else:
840
+ return countryBFS, chunkBFS, output
841
+ else:
842
+ if countryDFS != "unknown":
843
+ return countryDFS, chunkDFS, output
844
+ if countryBFS != "unknown":
845
+ return countryBFS, chunkBFS, output
846
+ else:
847
+ # scenario 2:
848
+ '''cross-ref: e.g. the A1YU101 keyword is in file 2, which also includes KM1, but KM1 itself is in file 1;
849
+ if we look at file 1 first, we can build a lookup dict with the country
850
+ such as Thailand as the key and its re'''
851
+ cache[f] = final_input
852
+ if keyword_appear[0] == True:
853
+ for c in cache:
854
+ if c!=keyword_appear[1]:
855
+ if cache[c].lower() not in output.lower():
856
+ output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
857
+ chunkBFS = get_contextual_sentences_BFS(output, keyword)
858
+ countryBFS = model.get_country_from_text(chunkBFS)
859
+ countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
860
+ if countryDFS != "unknown" and countryBFS != "unknown":
861
+ if len(chunkDFS) <= len(chunkBFS):
862
+ return countryDFS, chunkDFS, output
863
+ else:
864
+ return countryBFS, chunkBFS, output
865
+ else:
866
+ if countryDFS != "unknown":
867
+ return countryDFS, chunkDFS, output
868
+ if countryBFS != "unknown":
869
+ return countryBFS, chunkBFS, output
870
+ else:
871
+ if cache[f].lower() not in output.lower():
872
+ output = merge_texts_skipping_overlap(output, cache[f]) + "\n"
873
+ if len(output) == 0 or keyword_appear[0]==False:
874
+ for c in cache:
875
+ if cache[c].lower() not in output.lower():
876
+ output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
877
  return country, "", output
model.py CHANGED
The diff for this file is too large to render. See raw diff
 
mtdna_backend.py CHANGED
@@ -1,1145 +1,1005 @@
1
- import gradio as gr
2
- from collections import Counter
3
- import csv
4
- import os
5
- from functools import lru_cache
6
- #import app
7
- from mtdna_classifier import classify_sample_location
8
- import data_preprocess, model, pipeline
9
- import subprocess
10
- import json
11
- import pandas as pd
12
- import io
13
- import re
14
- import tempfile
15
- import gspread
16
- from oauth2client.service_account import ServiceAccountCredentials
17
- from io import StringIO
18
- import hashlib
19
- import threading
20
-
21
- # @lru_cache(maxsize=3600)
22
- # def classify_sample_location_cached(accession):
23
- # return classify_sample_location(accession)
24
-
25
- #@lru_cache(maxsize=3600)
26
- async def pipeline_classify_sample_location_cached(accession,stop_flag=None, save_df=None):
27
- print("inside pipeline_classify_sample_location_cached, and [accession] is ", [accession])
28
- print("len of save df: ", len(save_df))
29
- return await pipeline.pipeline_with_gemini([accession],stop_flag=stop_flag, save_df=save_df)
30
-
31
- # Count and suggest final location
32
- # def compute_final_suggested_location(rows):
33
- # candidates = [
34
- # row.get("Predicted Location", "").strip()
35
- # for row in rows
36
- # if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
37
- # ] + [
38
- # row.get("Inferred Region", "").strip()
39
- # for row in rows
40
- # if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
41
- # ]
42
-
43
- # if not candidates:
44
- # return Counter(), ("Unknown", 0)
45
- # # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
46
- # tokens = []
47
- # for item in candidates:
48
- # # Split by comma, whitespace, and newlines
49
- # parts = re.split(r'[\s,]+', item)
50
- # tokens.extend(parts)
51
-
52
- # # Step 2: Clean and normalize tokens
53
- # tokens = [word.strip() for word in tokens if word.strip().isalpha()] # Keep only alphabetic tokens
54
-
55
- # # Step 3: Count
56
- # counts = Counter(tokens)
57
-
58
- # # Step 4: Get most common
59
- # top_location, count = counts.most_common(1)[0]
60
- # return counts, (top_location, count)
61
-
62
- # Store feedback (with required fields)
63
-
64
- def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
65
- if not answer1.strip() or not answer2.strip():
66
- return "⚠️ Please answer both questions before submitting."
67
-
68
- try:
69
- # ✅ Step: Load credentials from Hugging Face secret
70
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
71
- scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
72
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
73
-
74
- # Connect to Google Sheet
75
- client = gspread.authorize(creds)
76
- sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
77
-
78
- # Append feedback
79
- sheet.append_row([accession, answer1, answer2, contact])
80
- return "✅ Feedback submitted. Thank you!"
81
-
82
- except Exception as e:
83
- return f"❌ Error submitting feedback: {e}"
84
-
85
- import re
86
-
87
- ACCESSION_REGEX = re.compile(r'^[A-Z]{1,4}_?\d{6}(\.\d+)?$')
88
-
89
- def is_valid_accession(acc):
90
- return bool(ACCESSION_REGEX.match(acc))
91
-
92
- # helper function to extract accessions
93
- def extract_accessions_from_input(file=None, raw_text=""):
94
- print(f"RAW TEXT RECEIVED: {raw_text}")
95
- accessions, invalid_accessions = [], []
96
- seen = set()
97
- if file:
98
- try:
99
- if file.name.endswith(".csv"):
100
- df = pd.read_csv(file)
101
- elif file.name.endswith(".xlsx"):
102
- df = pd.read_excel(file)
103
- else:
104
- return [], "Unsupported file format. Please upload CSV or Excel."
105
- for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
106
- if acc not in seen:
107
- if is_valid_accession(acc):
108
- accessions.append(acc)
109
- seen.add(acc)
110
- else:
111
- invalid_accessions.append(acc)
112
-
113
- except Exception as e:
114
- return [],[], f"Failed to read file: {e}"
115
-
116
- if raw_text:
117
- try:
118
- text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
119
- for acc in text_ids:
120
- if acc not in seen:
121
- if is_valid_accession(acc):
122
- accessions.append(acc)
123
- seen.add(acc)
124
- else:
125
- invalid_accessions.append(acc)
126
- except Exception as e:
127
- return [],[], f"Failed to read file: {e}"
128
-
129
- return list(accessions), list(invalid_accessions), None
130
- # ✅ Add a new helper to backend: `filter_unprocessed_accessions()`
131
- def get_incomplete_accessions(file_path):
132
- df = pd.read_excel(file_path)
133
-
134
- incomplete_accessions = []
135
- for _, row in df.iterrows():
136
- sample_id = str(row.get("Sample ID", "")).strip()
137
-
138
- # Skip if no sample ID
139
- if not sample_id:
140
- continue
141
-
142
- # Drop the Sample ID and check if the rest is empty
143
- other_cols = row.drop(labels=["Sample ID"], errors="ignore")
144
- if other_cols.isna().all() or (other_cols.astype(str).str.strip() == "").all():
145
- # Extract the accession number from the sample ID using regex
146
- match = re.search(r"\b[A-Z]{2,4}\d{4,}", sample_id)
147
- if match:
148
- incomplete_accessions.append(match.group(0))
149
- print(len(incomplete_accessions))
150
- return incomplete_accessions
151
-
152
- # GOOGLE_SHEET_NAME = "known_samples"
153
- # USAGE_DRIVE_FILENAME = "user_usage_log.json"
154
- def truncate_cell(value, max_len=49000):
155
- """Ensure cell content never exceeds Google Sheets 50k char limit."""
156
- if not isinstance(value, str):
157
- value = str(value)
158
- return value[:max_len] + ("... [TRUNCATED]" if len(value) > max_len else "")
159
-
160
-
161
- async def summarize_results(accession, stop_flag=None):
162
- # Early bail
163
- if stop_flag is not None and stop_flag.value:
164
- print(f"🛑 Skipping {accession} before starting.")
165
- return []
166
- # try cache first
167
- cached = check_known_output(accession)
168
- if cached:
169
- print(f" Using cached result for {accession}")
170
- return [[
171
- cached["Sample ID"] or "unknown",
172
- cached["Predicted Country"] or "unknown",
173
- cached["Country Explanation"] or "unknown",
174
- cached["Predicted Sample Type"] or "unknown",
175
- cached["Sample Type Explanation"] or "unknown",
176
- cached["Sources"] or "No Links",
177
- cached["Time cost"]
178
- ]]
179
- # only run when nothing in the cache
180
- try:
181
- print("try gemini pipeline: ",accession)
182
- # Load credentials from Hugging Face secret
183
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
184
- scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
185
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
186
- client = gspread.authorize(creds)
187
-
188
- spreadsheet = client.open("known_samples")
189
- sheet = spreadsheet.sheet1
190
-
191
- data = sheet.get_all_values()
192
- if not data:
193
- print("⚠️ Google Sheet 'known_samples' is empty.")
194
- return None
195
-
196
- save_df = pd.DataFrame(data[1:], columns=data[0])
197
- print("before pipeline, len of save df: ", len(save_df))
198
- outputs = await pipeline_classify_sample_location_cached(accession, stop_flag, save_df)
199
- if stop_flag is not None and stop_flag.value:
200
- print(f"🛑 Skipped {accession} mid-pipeline.")
201
- return []
202
- # outputs = {'KU131308': {'isolate':'BRU18',
203
- # 'country': {'brunei': ['ncbi',
204
- # 'rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
205
- # 'sample_type': {'modern':
206
- # ['rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
207
- # 'query_cost': 9.754999999999999e-05,
208
- # 'time_cost': '24.776 seconds',
209
- # 'source': ['https://doi.org/10.1007/s00439-015-1620-z',
210
- # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf',
211
- # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls']}}
212
- except Exception as e:
213
- return []#, f"Error: {e}", f"Error: {e}", f"Error: {e}"
214
-
215
- if accession not in outputs:
216
- print("no accession in output ", accession)
217
- return []#, "Accession not found in results.", "Accession not found in results.", "Accession not found in results."
218
-
219
- row_score = []
220
- rows = []
221
- save_rows = []
222
- for key in outputs:
223
- pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
224
- for section, results in outputs[key].items():
225
- if section == "country" or section =="sample_type":
226
- pred_output = []#"\n".join(list(results.keys()))
227
- output_explanation = ""
228
- for result, content in results.items():
229
- if len(result) == 0: result = "unknown"
230
- if len(content) == 0: output_explanation = "unknown"
231
- else:
232
- output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
233
- pred_output.append(result)
234
- pred_output = "\n".join(pred_output)
235
- if section == "country":
236
- pred_country, country_explanation = pred_output, output_explanation
237
- elif section == "sample_type":
238
- pred_sample, sample_explanation = pred_output, output_explanation
239
- if outputs[key]["isolate"].lower()!="unknown":
240
- label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
241
- else: label = key
242
- if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
243
- # row = {
244
- # "Sample ID": label or "unknown",
245
- # "Predicted Country": pred_country or "unknown",
246
- # "Country Explanation": country_explanation or "unknown",
247
- # "Predicted Sample Type":pred_sample or "unknown",
248
- # "Sample Type Explanation":sample_explanation or "unknown",
249
- # "Sources": "\n".join(outputs[key]["source"]) or "No Links",
250
- # "Time cost": outputs[key]["time_cost"]
251
- # }
252
- row = {
253
- "Sample ID": truncate_cell(label or "unknown"),
254
- "Predicted Country": truncate_cell(pred_country or "unknown"),
255
- "Country Explanation": truncate_cell(country_explanation or "unknown"),
256
- "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
257
- "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
258
- "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
259
- "Time cost": truncate_cell(outputs[key]["time_cost"])
260
- }
261
- #row_score.append(row)
262
- rows.append(list(row.values()))
263
-
264
- # save_row = {
265
- # "Sample ID": label or "unknown",
266
- # "Predicted Country": pred_country or "unknown",
267
- # "Country Explanation": country_explanation or "unknown",
268
- # "Predicted Sample Type":pred_sample or "unknown",
269
- # "Sample Type Explanation":sample_explanation or "unknown",
270
- # "Sources": "\n".join(outputs[key]["source"]) or "No Links",
271
- # "Query_cost": outputs[key]["query_cost"] or "",
272
- # "Time cost": outputs[key]["time_cost"] or "",
273
- # "file_chunk":outputs[key]["file_chunk"] or "",
274
- # "file_all_output":outputs[key]["file_all_output"] or ""
275
- # }
276
- save_row = {
277
- "Sample ID": truncate_cell(label or "unknown"),
278
- "Predicted Country": truncate_cell(pred_country or "unknown"),
279
- "Country Explanation": truncate_cell(country_explanation or "unknown"),
280
- "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
281
- "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
282
- "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
283
- "Query_cost": outputs[key]["query_cost"] or "",
284
- "Time cost": outputs[key]["time_cost"] or "",
285
- "file_chunk": truncate_cell(outputs[key]["file_chunk"] or ""),
286
- "file_all_output": truncate_cell(outputs[key]["file_all_output"] or "")
287
- }
288
-
289
- #row_score.append(row)
290
- save_rows.append(list(save_row.values()))
291
-
292
- # #location_counts, (final_location, count) = compute_final_suggested_location(row_score)
293
- # summary_lines = [f"### 🧭 Location Summary:\n"]
294
- # summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
295
- # summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
296
- # summary = "\n".join(summary_lines)
297
-
298
- # save the new running sample to known excel file
299
- # try:
300
- # df_new = pd.DataFrame(save_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Query_cost","Time cost"])
301
- # if os.path.exists(KNOWN_OUTPUT_PATH):
302
- # df_old = pd.read_excel(KNOWN_OUTPUT_PATH)
303
- # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
304
- # else:
305
- # df_combined = df_new
306
- # df_combined.to_excel(KNOWN_OUTPUT_PATH, index=False)
307
- # except Exception as e:
308
- # print(f"⚠️ Failed to save known output: {e}")
309
- # try:
310
- # df_new = pd.DataFrame(save_rows, columns=[
311
- # "Sample ID", "Predicted Country", "Country Explanation",
312
- # "Predicted Sample Type", "Sample Type Explanation",
313
- # "Sources", "Query_cost", "Time cost"
314
- # ])
315
-
316
- # # Google Sheets API setup
317
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
318
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
319
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
320
- # client = gspread.authorize(creds)
321
-
322
- # # Open the known_samples sheet
323
- # spreadsheet = client.open("known_samples") # Replace with your sheet name
324
- # sheet = spreadsheet.sheet1
325
-
326
- # # ✅ Read old data
327
- # existing_data = sheet.get_all_values()
328
- # if existing_data:
329
- # df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
330
- # else:
331
- # df_old = pd.DataFrame(columns=df_new.columns)
332
-
333
- # # Combine and remove duplicates
334
- # df_combined = pd.concat([df_old, df_new], ignore_index=True).drop_duplicates(subset="Sample ID")
335
-
336
- # # Clear and write back
337
- # sheet.clear()
338
- # sheet.update([df_combined.columns.values.tolist()] + df_combined.values.tolist())
339
-
340
- # except Exception as e:
341
- # print(f"⚠️ Failed to save known output to Google Sheets: {e}")
342
- # try:
343
- # # Prepare as DataFrame
344
- # df_new = pd.DataFrame(save_rows, columns=[
345
- # "Sample ID", "Predicted Country", "Country Explanation",
346
- # "Predicted Sample Type", "Sample Type Explanation",
347
- # "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
348
- # ])
349
-
350
- # # ✅ Setup Google Sheets
351
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
352
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
353
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
354
- # client = gspread.authorize(creds)
355
- # spreadsheet = client.open("known_samples")
356
- # sheet = spreadsheet.sheet1
357
-
358
- # # ✅ Read existing data
359
- # existing_data = sheet.get_all_values()
360
- # headers = existing_data[0]
361
-
362
- # if existing_data:
363
- # df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
364
-
365
- # else:
366
-
367
- # df_old = pd.DataFrame(columns=[
368
- # "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
369
- # "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
370
- # "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
371
- # ])
372
-
373
-
374
- # # ✅ Index by Sample ID
375
- # df_old.set_index("Sample ID", inplace=True)
376
- # df_new.set_index("Sample ID", inplace=True)
377
-
378
- # # Update only matching fields
379
- # update_columns = [
380
- # "Predicted Country", "Predicted Sample Type", "Country Explanation",
381
- # "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
382
- # ]
383
- # for idx, row in df_new.iterrows():
384
- # if idx not in df_old.index:
385
- # df_old.loc[idx] = "" # new row, fill empty first
386
- # for col in update_columns:
387
- # if pd.notna(row[col]) and row[col] != "":
388
- # df_old.at[idx, col] = row[col]
389
-
390
- # # ✅ Reset and write back
391
- # EXPECTED_COLUMNS = [
392
- # "Sample ID", "Predicted Country", "Country Explanation",
393
- # "Predicted Sample Type", "Sample Type Explanation",
394
- # "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
395
- # ]
396
-
397
- # # Force schema
398
- # for col in EXPECTED_COLUMNS:
399
- # if col not in df_old.columns:
400
- # df_old[col] = ""
401
-
402
- # df_old = df_old[EXPECTED_COLUMNS].reset_index(inplace=True) # reorder + drop unexpected
403
-
404
- # # ✅ Safe update
405
- # sheet.clear()
406
- # sheet.update([EXPECTED_COLUMNS] + df_old.astype(str).values.tolist())
407
-
408
- # # df_old.reset_index(inplace=True)
409
- # # sheet.clear()
410
- # # sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
411
- # print(" Match results saved to known_samples.")
412
-
413
- # except Exception as e:
414
- # print(f"❌ Failed to update known_samples: {e}")
415
- try:
416
- # Prepare as DataFrame
417
- df_new = pd.DataFrame(save_rows, columns=[
418
- "Sample ID", "Predicted Country", "Country Explanation",
419
- "Predicted Sample Type", "Sample Type Explanation",
420
- "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
421
- ])
422
-
423
- # Setup Google Sheets
424
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
425
- scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
426
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
427
- client = gspread.authorize(creds)
428
- spreadsheet = client.open("known_samples")
429
- sheet = spreadsheet.sheet1
430
-
431
- # Load existing data
432
- existing_data = sheet.get_all_values()
433
- headers = existing_data[0]
434
- existing_df = pd.DataFrame(existing_data[1:], columns=headers)
435
-
436
- # Build lookup: Sample ID → row index
437
- id_to_row = {sid: i+2 for i, sid in enumerate(existing_df["Sample ID"])}
438
- # +2 because gspread is 1-based and row 1 is headers
439
-
440
- for _, row in df_new.iterrows():
441
- sid = row["Sample ID"]
442
-
443
- # Row values in correct schema order
444
- # row_values = [
445
- # row.get("Sample ID", ""),
446
- # row.get("Predicted Country", ""),
447
- # row.get("Country Explanation", ""),
448
- # row.get("Predicted Sample Type", ""),
449
- # row.get("Sample Type Explanation", ""),
450
- # row.get("Sources", ""),
451
- # row.get("Query_cost", ""),
452
- # row.get("Time cost", ""),
453
- # row.get("file_chunk", ""),
454
- # row.get("file_all_output", "")
455
- # ]
456
- row_values = [
457
- truncate_cell(row.get("Sample ID", "")),
458
- truncate_cell(row.get("Predicted Country", "")),
459
- truncate_cell(row.get("Country Explanation", "")),
460
- truncate_cell(row.get("Predicted Sample Type", "")),
461
- truncate_cell(row.get("Sample Type Explanation", "")),
462
- truncate_cell(row.get("Sources", "")),
463
- truncate_cell(row.get("Query_cost", "")),
464
- truncate_cell(row.get("Time cost", "")),
465
- truncate_cell(row.get("file_chunk", "")),
466
- truncate_cell(row.get("file_all_output", ""))
467
- ]
468
-
469
-
470
- if sid in id_to_row:
471
- # Update existing row
472
- sheet.update(f"A{id_to_row[sid]}:J{id_to_row[sid]}", [row_values])
473
- else:
474
- # Append new row
475
- sheet.append_row(row_values)
476
-
477
- print("✅ Match results safely saved to known_samples.")
478
-
479
- except Exception as e:
480
- print(f"❌ Failed to update known_samples: {e}")
481
-
482
-
483
- return rows#, summary, labelAncient_Modern, explain_label
484
-
485
- # save the batch input in excel file
486
- # def save_to_excel(all_rows, summary_text, flag_text, filename):
487
- # with pd.ExcelWriter(filename) as writer:
488
- # # Save table
489
- # df_new = pd.DataFrame(all_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
490
- # df.to_excel(writer, sheet_name="Detailed Results", index=False)
491
- # try:
492
- # df_old = pd.read_excel(filename)
493
- # except:
494
- # df_old = pd.DataFrame([[]], columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
495
- # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
496
- # # if os.path.exists(filename):
497
- # # df_old = pd.read_excel(filename)
498
- # # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
499
- # # else:
500
- # # df_combined = df_new
501
- # df_combined.to_excel(filename, index=False)
502
- # # # Save summary
503
- # # summary_df = pd.DataFrame({"Summary": [summary_text]})
504
- # # summary_df.to_excel(writer, sheet_name="Summary", index=False)
505
-
506
- # # # Save flag
507
- # # flag_df = pd.DataFrame({"Flag": [flag_text]})
508
- # # flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
509
- # def save_to_excel(all_rows, summary_text, flag_text, filename):
510
- # df_new = pd.DataFrame(all_rows, columns=[
511
- # "Sample ID", "Predicted Country", "Country Explanation",
512
- # "Predicted Sample Type", "Sample Type Explanation",
513
- # "Sources", "Time cost"
514
- # ])
515
-
516
- # try:
517
- # if os.path.exists(filename):
518
- # df_old = pd.read_excel(filename)
519
- # else:
520
- # df_old = pd.DataFrame(columns=df_new.columns)
521
- # except Exception as e:
522
- # print(f"⚠️ Warning reading old Excel file: {e}")
523
- # df_old = pd.DataFrame(columns=df_new.columns)
524
-
525
- # #df_combined = pd.concat([df_new, df_old], ignore_index=True).drop_duplicates(subset="Sample ID", keep="first")
526
- # df_old.set_index("Sample ID", inplace=True)
527
- # df_new.set_index("Sample ID", inplace=True)
528
-
529
- # df_old.update(df_new) # <-- update matching rows in df_old with df_new content
530
-
531
- # df_combined = df_old.reset_index()
532
-
533
- # try:
534
- # df_combined.to_excel(filename, index=False)
535
- # except Exception as e:
536
- # print(f" Failed to write Excel file {filename}: {e}")
537
- def save_to_excel(all_rows, summary_text, flag_text, filename, is_resume=False):
538
- df_new = pd.DataFrame(all_rows, columns=[
539
- "Sample ID", "Predicted Country", "Country Explanation",
540
- "Predicted Sample Type", "Sample Type Explanation",
541
- "Sources", "Time cost"
542
- ])
543
-
544
- if is_resume and os.path.exists(filename):
545
- try:
546
- df_old = pd.read_excel(filename)
547
- except Exception as e:
548
- print(f"⚠️ Warning reading old Excel file: {e}")
549
- df_old = pd.DataFrame(columns=df_new.columns)
550
-
551
- # Set index and update existing rows
552
- df_old.set_index("Sample ID", inplace=True)
553
- df_new.set_index("Sample ID", inplace=True)
554
- df_old.update(df_new)
555
-
556
- df_combined = df_old.reset_index()
557
- else:
558
- # If not resuming or file doesn't exist, just use new rows
559
- df_combined = df_new
560
-
561
- try:
562
- df_combined.to_excel(filename, index=False)
563
- except Exception as e:
564
- print(f"❌ Failed to write Excel file {filename}: {e}")
565
-
566
-
567
- # save the batch input in JSON file
568
- def save_to_json(all_rows, summary_text, flag_text, filename):
569
- output_dict = {
570
- "Detailed_Results": all_rows#, # <-- make sure this is a plain list, not a DataFrame
571
- # "Summary_Text": summary_text,
572
- # "Ancient_Modern_Flag": flag_text
573
- }
574
-
575
- # If all_rows is a DataFrame, convert it
576
- if isinstance(all_rows, pd.DataFrame):
577
- output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")
578
-
579
- with open(filename, "w") as external_file:
580
- json.dump(output_dict, external_file, indent=2)
581
-
582
- # save the batch input in Text file
583
- def save_to_txt(all_rows, summary_text, flag_text, filename):
584
- if isinstance(all_rows, pd.DataFrame):
585
- detailed_results = all_rows.to_dict(orient="records")
586
- output = ""
587
- #output += ",".join(list(detailed_results[0].keys())) + "\n\n"
588
- output += ",".join([str(k) for k in detailed_results[0].keys()]) + "\n\n"
589
- for r in detailed_results:
590
- output += ",".join([str(v) for v in r.values()]) + "\n\n"
591
- with open(filename, "w") as f:
592
- f.write("=== Detailed Results ===\n")
593
- f.write(output + "\n")
594
-
595
- # f.write("\n=== Summary ===\n")
596
- # f.write(summary_text + "\n")
597
-
598
- # f.write("\n=== Ancient/Modern Flag ===\n")
599
- # f.write(flag_text + "\n")
600
-
601
- def save_batch_output(all_rows, output_type, summary_text=None, flag_text=None):
602
- tmp_dir = tempfile.mkdtemp()
603
-
604
- #html_table = all_rows.value # assuming this is stored somewhere
605
-
606
- # Parse back to DataFrame
607
- #all_rows = pd.read_html(all_rows)[0] # [0] because read_html returns a list
608
- all_rows = pd.read_html(StringIO(all_rows))[0]
609
- print(all_rows)
610
-
611
- if output_type == "Excel":
612
- file_path = f"{tmp_dir}/batch_output.xlsx"
613
- save_to_excel(all_rows, summary_text, flag_text, file_path)
614
- elif output_type == "JSON":
615
- file_path = f"{tmp_dir}/batch_output.json"
616
- save_to_json(all_rows, summary_text, flag_text, file_path)
617
- print("Done with JSON")
618
- elif output_type == "TXT":
619
- file_path = f"{tmp_dir}/batch_output.txt"
620
- save_to_txt(all_rows, summary_text, flag_text, file_path)
621
- else:
622
- return gr.update(visible=False) # invalid option
623
-
624
- return gr.update(value=file_path, visible=True)
625
- # save cost by checking the known outputs
626
-
627
- # def check_known_output(accession):
628
- # if not os.path.exists(KNOWN_OUTPUT_PATH):
629
- # return None
630
-
631
- # try:
632
- # df = pd.read_excel(KNOWN_OUTPUT_PATH)
633
- # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
634
- # if match:
635
- # accession = match.group(0)
636
-
637
- # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
638
- # if not matched.empty:
639
- # return matched.iloc[0].to_dict() # Return the cached row
640
- # except Exception as e:
641
- # print(f"⚠️ Failed to load known samples: {e}")
642
- # return None
643
-
644
- # def check_known_output(accession):
645
- # try:
646
- # # Load credentials from Hugging Face secret
647
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
648
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
649
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
650
- # client = gspread.authorize(creds)
651
-
652
- # # ✅ Open the known_samples sheet
653
- # spreadsheet = client.open("known_samples") # Replace with your sheet name
654
- # sheet = spreadsheet.sheet1
655
-
656
- # # ✅ Read all rows
657
- # data = sheet.get_all_values()
658
- # if not data:
659
- # return None
660
-
661
- # df = pd.DataFrame(data[1:], columns=data[0]) # Skip header row
662
-
663
- # # ✅ Normalize accession pattern
664
- # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
665
- # if match:
666
- # accession = match.group(0)
667
-
668
- # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
669
- # if not matched.empty:
670
- # return matched.iloc[0].to_dict()
671
-
672
- # except Exception as e:
673
- # print(f"⚠️ Failed to load known samples from Google Sheets: {e}")
674
- # return None
675
- # def check_known_output(accession):
676
- # print("inside check known output function")
677
- # try:
678
- # # Load credentials from Hugging Face secret
679
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
680
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
681
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
682
- # client = gspread.authorize(creds)
683
-
684
- # spreadsheet = client.open("known_samples")
685
- # sheet = spreadsheet.sheet1
686
-
687
- # data = sheet.get_all_values()
688
- # if not data:
689
- # print("⚠️ Google Sheet 'known_samples' is empty.")
690
- # return None
691
-
692
- # df = pd.DataFrame(data[1:], columns=data[0])
693
- # if "Sample ID" not in df.columns:
694
- # print("❌ Column 'Sample ID' not found in Google Sheet.")
695
- # return None
696
-
697
- # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
698
- # if match:
699
- # accession = match.group(0)
700
-
701
- # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
702
- # if not matched.empty:
703
- # #return matched.iloc[0].to_dict()
704
- # row = matched.iloc[0]
705
- # country = row.get("Predicted Country", "").strip().lower()
706
- # sample_type = row.get("Predicted Sample Type", "").strip().lower()
707
-
708
- # if country and country != "unknown" and sample_type and sample_type != "unknown":
709
- # return row.to_dict()
710
- # else:
711
- # print(f"⚠️ Accession {accession} found but country/sample_type is unknown or empty.")
712
- # return None
713
- # else:
714
- # print(f"🔍 Accession {accession} not found in known_samples.")
715
- # return None
716
-
717
- # except Exception as e:
718
- # import traceback
719
- # print("❌ Exception occurred during check_known_output:")
720
- # traceback.print_exc()
721
- # return None
722
-
723
- import os
724
- import re
725
- import json
726
- import time
727
- import gspread
728
- import pandas as pd
729
- from oauth2client.service_account import ServiceAccountCredentials
730
- from gspread.exceptions import APIError
731
-
732
- # --- Global cache ---
733
- _known_samples_cache = None
734
-
735
- def load_known_samples():
736
- """Load the Google Sheet 'known_samples' into a Pandas DataFrame and cache it."""
737
- global _known_samples_cache
738
- try:
739
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
740
- scope = [
741
- 'https://spreadsheets.google.com/feeds',
742
- 'https://www.googleapis.com/auth/drive'
743
- ]
744
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
745
- client = gspread.authorize(creds)
746
-
747
- sheet = client.open("known_samples").sheet1
748
- data = sheet.get_all_values()
749
-
750
- if not data:
751
- print("⚠️ Google Sheet 'known_samples' is empty.")
752
- _known_samples_cache = pd.DataFrame()
753
- else:
754
- _known_samples_cache = pd.DataFrame(data[1:], columns=data[0])
755
- print(f"✅ Cached {_known_samples_cache.shape[0]} rows from known_samples")
756
-
757
- except APIError as e:
758
- print(f"❌ APIError while loading known_samples: {e}")
759
- _known_samples_cache = pd.DataFrame()
760
- except Exception as e:
761
- import traceback
762
- print("❌ Exception occurred while loading known_samples:")
763
- traceback.print_exc()
764
- _known_samples_cache = pd.DataFrame()
765
-
766
- def check_known_output(accession):
767
- """Check if an accession exists in the cached 'known_samples' sheet."""
768
- global _known_samples_cache
769
- print("inside check known output function")
770
-
771
- try:
772
- # Load cache if not already loaded
773
- if _known_samples_cache is None:
774
- load_known_samples()
775
-
776
- if _known_samples_cache.empty:
777
- print("⚠️ No cached data available.")
778
- return None
779
-
780
- # Extract proper accession format (e.g. AB12345)
781
- match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
782
- if match:
783
- accession = match.group(0)
784
-
785
- matched = _known_samples_cache[
786
- _known_samples_cache["Sample ID"].str.contains(accession, case=False, na=False)
787
- ]
788
-
789
- if not matched.empty:
790
- row = matched.iloc[0]
791
- country = row.get("Predicted Country", "").strip().lower()
792
- sample_type = row.get("Predicted Sample Type", "").strip().lower()
793
-
794
- if country and country != "unknown" and sample_type and sample_type != "unknown":
795
- print(f"🎯 Found {accession} in cache")
796
- return row.to_dict()
797
- else:
798
- print(f"⚠️ Accession {accession} found but country/sample_type unknown or empty.")
799
- return None
800
- else:
801
- print(f"🔍 Accession {accession} not found in cache.")
802
- return None
803
-
804
- except Exception as e:
805
- import traceback
806
- print("❌ Exception occurred during check_known_output:")
807
- traceback.print_exc()
808
- return None
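A usage sketch for the cached lookup above, using the signatures visible in this version of the file; GCP_CREDS_JSON must hold service-account JSON for the Sheets call inside load_known_samples to succeed:

from mtdna_backend import load_known_samples, check_known_output

load_known_samples()                  # one Sheets read, cached in _known_samples_cache
row = check_known_output("KU131308")  # dict of the cached row, or None on a miss
if row:
    print(row["Predicted Country"], row["Predicted Sample Type"])
else:
    print("not cached; fall back to the full Gemini pipeline")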
809
-
810
-
811
-
812
- def hash_user_id(user_input):
813
- return hashlib.sha256(user_input.encode()).hexdigest()
814
-
815
- # Load and save usage count
816
-
817
- # def load_user_usage():
818
- # if not os.path.exists(USER_USAGE_TRACK_FILE):
819
- # return {}
820
-
821
- # try:
822
- # with open(USER_USAGE_TRACK_FILE, "r") as f:
823
- # content = f.read().strip()
824
- # if not content:
825
- # return {} # file is empty
826
- # return json.loads(content)
827
- # except (json.JSONDecodeError, ValueError):
828
- # print("⚠️ Warning: user_usage.json is corrupted or invalid. Resetting.")
829
- # return {} # fallback to empty dict
830
- # def load_user_usage():
831
- # try:
832
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
833
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
834
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
835
- # client = gspread.authorize(creds)
836
-
837
- # sheet = client.open("user_usage_log").sheet1
838
- # data = sheet.get_all_records() # Assumes columns: email, usage_count
839
-
840
- # usage = {}
841
- # for row in data:
842
- # email = row.get("email", "").strip().lower()
843
- # count = int(row.get("usage_count", 0))
844
- # if email:
845
- # usage[email] = count
846
- # return usage
847
- # except Exception as e:
848
- # print(f"⚠️ Failed to load user usage from Google Sheets: {e}")
849
- # return {}
850
- # def load_user_usage():
851
- # try:
852
- # parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
853
- # iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
854
-
855
- # found = pipeline.find_drive_file("user_usage_log.json", parent_id=iterate3_id)
856
- # if not found:
857
- # return {} # not found, start fresh
858
-
859
- # #file_id = found[0]["id"]
860
- # file_id = found
861
- # content = pipeline.download_drive_file_content(file_id)
862
- # return json.loads(content.strip()) if content.strip() else {}
863
-
864
- # except Exception as e:
865
- # print(f"⚠️ Failed to load user_usage_log.json from Google Drive: {e}")
866
- # return {}
867
- def load_user_usage():
868
- try:
869
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
870
- scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
871
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
872
- client = gspread.authorize(creds)
873
-
874
- sheet = client.open("user_usage_log").sheet1
875
- data = sheet.get_all_values()
876
- print("data: ", data)
877
- print("🧪 Raw header row from sheet:", data[0])
878
- print("🧪 Character codes in each header:")
879
- for h in data[0]:
880
- print([ord(c) for c in h])
881
-
882
- if not data or len(data) < 2:
883
- print("⚠️ Sheet is empty or missing rows.")
884
- return {}
885
-
886
- headers = [h.strip().lower() for h in data[0]]
887
- if "email" not in headers or "usage_count" not in headers:
888
- print("❌ Header format incorrect. Must have 'email' and 'usage_count'.")
889
- return {}
890
-
891
- permitted_index = headers.index("permitted_samples") if "permitted_samples" in headers else None
892
- df = pd.DataFrame(data[1:], columns=headers)
893
-
894
- usage = {}
895
- permitted = {}
896
- for _, row in df.iterrows():
897
- email = row.get("email", "").strip().lower()
898
- try:
899
- #count = int(row.get("usage_count", 0))
900
- try:
901
- count = int(float(row.get("usage_count", 0)))
902
- except Exception:
903
- print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
904
- count = 0
905
-
906
- if email:
907
- usage[email] = count
908
- if permitted_index is not None:
909
- try:
910
- permitted_count = int(float(row.get("permitted_samples", 50)))
911
- permitted[email] = permitted_count
912
- except:
913
- permitted[email] = 50
914
-
915
- except ValueError:
916
- print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
917
- return usage, permitted
918
-
919
- except Exception as e:
920
- print(f"❌ Error in load_user_usage: {e}")
921
- return {}, {}
922
-
923
-
924
-
925
- # def save_user_usage(usage):
926
- # with open(USER_USAGE_TRACK_FILE, "w") as f:
927
- # json.dump(usage, f, indent=2)
928
-
929
- # def save_user_usage(usage_dict):
930
- # try:
931
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
932
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
933
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
934
- # client = gspread.authorize(creds)
935
-
936
- # sheet = client.open("user_usage_log").sheet1
937
- # sheet.clear() # clear old contents first
938
-
939
- # # Write header + rows
940
- # rows = [["email", "usage_count"]] + [[email, count] for email, count in usage_dict.items()]
941
- # sheet.update(rows)
942
- # except Exception as e:
943
- # print(f"❌ Failed to save user usage to Google Sheets: {e}")
944
- # def save_user_usage(usage_dict):
945
- # try:
946
- # parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
947
- # iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
948
-
949
- # import tempfile
950
- # tmp_path = os.path.join(tempfile.gettempdir(), "user_usage_log.json")
951
- # print("💾 Saving this usage dict:", usage_dict)
952
- # with open(tmp_path, "w") as f:
953
- # json.dump(usage_dict, f, indent=2)
954
-
955
- # pipeline.upload_file_to_drive(tmp_path, "user_usage_log.json", iterate3_id)
956
-
957
- # except Exception as e:
958
- # print(f" Failed to save user_usage_log.json to Google Drive: {e}")
959
- # def save_user_usage(usage_dict):
960
- # try:
961
- # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
962
- # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
963
- # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
964
- # client = gspread.authorize(creds)
965
-
966
- # spreadsheet = client.open("user_usage_log")
967
- # sheet = spreadsheet.sheet1
968
-
969
- # # Step 1: Convert new usage to DataFrame
970
- # df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
971
- # df_new["email"] = df_new["email"].str.strip().str.lower()
972
-
973
- # # Step 2: Load existing data
974
- # existing_data = sheet.get_all_values()
975
- # print("🧪 Sheet existing_data:", existing_data)
976
-
977
- # # Try to load old data
978
- # if existing_data and len(existing_data[0]) >= 1:
979
- # df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
980
-
981
- # # Fix missing columns
982
- # if "email" not in df_old.columns:
983
- # df_old["email"] = ""
984
- # if "usage_count" not in df_old.columns:
985
- # df_old["usage_count"] = 0
986
-
987
- # df_old["email"] = df_old["email"].str.strip().str.lower()
988
- # df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
989
- # else:
990
- # df_old = pd.DataFrame(columns=["email", "usage_count"])
991
-
992
- # # Step 3: Merge
993
- # df_combined = pd.concat([df_old, df_new], ignore_index=True)
994
- # df_combined = df_combined.groupby("email", as_index=False).sum()
995
-
996
- # # Step 4: Write back
997
- # sheet.clear()
998
- # sheet.update([df_combined.columns.tolist()] + df_combined.astype(str).values.tolist())
999
- # print("✅ Saved user usage to user_usage_log sheet.")
1000
-
1001
- # except Exception as e:
1002
- # print(f"❌ Failed to save user usage to Google Sheets: {e}")
1003
- def save_user_usage(usage_dict):
1004
- try:
1005
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
1006
- scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
1007
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
1008
- client = gspread.authorize(creds)
1009
-
1010
- spreadsheet = client.open("user_usage_log")
1011
- sheet = spreadsheet.sheet1
1012
-
1013
- # Build new df
1014
- df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
1015
- df_new["email"] = df_new["email"].str.strip().str.lower()
1016
- df_new["usage_count"] = pd.to_numeric(df_new["usage_count"], errors="coerce").fillna(0).astype(int)
1017
-
1018
- # Read existing data
1019
- existing_data = sheet.get_all_values()
1020
- if existing_data and len(existing_data[0]) >= 2:
1021
- df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
1022
- df_old["email"] = df_old["email"].str.strip().str.lower()
1023
- df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
1024
- else:
1025
- df_old = pd.DataFrame(columns=["email", "usage_count"])
1026
-
1027
- # ✅ Overwrite specific emails only
1028
- df_old = df_old.set_index("email")
1029
- for email, count in usage_dict.items():
1030
- email = email.strip().lower()
1031
- df_old.loc[email, "usage_count"] = count
1032
- df_old = df_old.reset_index()
1033
-
1034
- # Save
1035
- sheet.clear()
1036
- sheet.update([df_old.columns.tolist()] + df_old.astype(str).values.tolist())
1037
- print("✅ Saved user usage to user_usage_log sheet.")
1038
-
1039
- except Exception as e:
1040
- print(f"❌ Failed to save user usage to Google Sheets: {e}")
1041
-
1042
-
1043
-
1044
-
1045
- # def increment_usage(user_id, num_samples=1):
1046
- # usage = load_user_usage()
1047
- # if user_id not in usage:
1048
- # usage[user_id] = 0
1049
- # usage[user_id] += num_samples
1050
- # save_user_usage(usage)
1051
- # return usage[user_id]
1052
- # def increment_usage(email: str, count: int):
1053
- # usage = load_user_usage()
1054
- # email_key = email.strip().lower()
1055
- # usage[email_key] = usage.get(email_key, 0) + count
1056
- # save_user_usage(usage)
1057
- # return usage[email_key]
1058
- def increment_usage(email: str, count: int = 1):
1059
- usage, permitted = load_user_usage()
1060
- email_key = email.strip().lower()
1061
- #usage[email_key] = usage.get(email_key, 0) + count
1062
- current = usage.get(email_key, 0)
1063
- new_value = current + count
1064
- max_allowed = permitted.get(email_key) or 50
1065
- usage[email_key] = max(current, new_value) # ✅ Prevent overwrite with lower
1066
- print(f"🧪 increment_usage saving: {email_key=} {current=} + {count=} => {usage[email_key]=}")
1067
- print("max allow is: ", max_allowed)
1068
- save_user_usage(usage)
1069
- return usage[email_key], max_allowed
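Note that increment_usage returns a (new_count, max_allowed) pair rather than a single integer, so callers should unpack both values; a minimal sketch with a made-up email:

from mtdna_backend import increment_usage

used, limit = increment_usage("user@example.com", 3)
if used >= limit:
    print(f"Quota reached: {used}/{limit} samples")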
1070
-
1071
-
1072
- # run the batch
1073
- def summarize_batch(file=None, raw_text="", resume_file=None, user_email="",
1074
- stop_flag=None, output_file_path=None,
1075
- limited_acc=50, yield_callback=None):
1076
- if user_email:
1077
- limited_acc += 10
1078
- accessions, error = extract_accessions_from_input(file, raw_text)
1079
- if error:
1080
- #return [], "", "", f"Error: {error}"
1081
- return [], f"Error: {error}", 0, "", ""
1082
- if resume_file:
1083
- accessions = get_incomplete_accessions(resume_file)
1084
- tmp_dir = tempfile.mkdtemp()
1085
- if not output_file_path:
1086
- if resume_file:
1087
- output_file_path = os.path.join(tmp_dir, resume_file)
1088
- else:
1089
- output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
1090
-
1091
- all_rows = []
1092
- # all_summaries = []
1093
- # all_flags = []
1094
- progress_lines = []
1095
- warning = ""
1096
- if len(accessions) > limited_acc:
1097
- accessions = accessions[:limited_acc]
1098
- warning = f"Your number of accessions is more than the {limited_acc}, only handle first {limited_acc} accessions"
1099
- for i, acc in enumerate(accessions):
1100
- if stop_flag and stop_flag.value:
1101
- line = f"🛑 Stopped at {acc} ({i+1}/{len(accessions)})"
1102
- progress_lines.append(line)
1103
- if yield_callback:
1104
- yield_callback(line)
1105
- print("🛑 User requested stop.")
1106
- break
1107
- print(f"[{i+1}/{len(accessions)}] Processing {acc}")
1108
- try:
1109
- # rows, summary, label, explain = summarize_results(acc)
1110
- rows = summarize_results(acc)
1111
- all_rows.extend(rows)
1112
- # all_summaries.append(f"**{acc}**\n{summary}")
1113
- # all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
1114
- #save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path)
1115
- save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path, is_resume=bool(resume_file))
1116
- line = f"✅ Processed {acc} ({i+1}/{len(accessions)})"
1117
- progress_lines.append(line)
1118
- if yield_callback:
1119
- yield_callback(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
1120
- except Exception as e:
1121
- print(f"❌ Failed to process {acc}: {e}")
1122
- continue
1123
- #all_summaries.append(f"**{acc}**: Failed - {e}")
1124
- #progress_lines.append(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
1125
- limited_acc -= 1
1126
- """for row in all_rows:
1127
- source_column = row[2] # Assuming the "Source" is in the 3rd column (index 2)
1128
-
1129
- if source_column.startswith("http"): # Check if the source is a URL
1130
- # Wrap it with HTML anchor tags to make it clickable
1131
- row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""
1132
- if not warning:
1133
- warning = f"You only have {limited_acc} left"
1134
- if user_email.strip():
1135
- user_hash = hash_user_id(user_email)
1136
- total_queries = increment_usage(user_hash, len(all_rows))
1137
- else:
1138
- total_queries = 0
1139
- yield_callback("✅ Finished!")
1140
-
1141
- # summary_text = "\n\n---\n\n".join(all_summaries)
1142
- # flag_text = "\n\n---\n\n".join(all_flags)
1143
- #return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
1144
- #return all_rows, gr.update(visible=True), gr.update(visible=False)
1145
  return all_rows, output_file_path, total_queries, "\n".join(progress_lines), warning
 
1
+ import gradio as gr
2
+ from collections import Counter
3
+ import csv
4
+ import os
5
+ from functools import lru_cache
6
+ #import app
7
+ from mtdna_classifier import classify_sample_location
8
+ import data_preprocess, model, pipeline
9
+ import subprocess
10
+ import json
11
+ import pandas as pd
12
+ import io
13
+ import re
14
+ import tempfile
15
+ import gspread
16
+ from oauth2client.service_account import ServiceAccountCredentials
17
+ from io import StringIO
18
+ import hashlib
19
+ import threading
20
+
21
+ # @lru_cache(maxsize=3600)
22
+ # def classify_sample_location_cached(accession):
23
+ # return classify_sample_location(accession)
24
+
25
+ #@lru_cache(maxsize=3600)
26
+ async def pipeline_classify_sample_location_cached(accession,stop_flag=None, save_df=None, niche_cases=None):
27
+ print("inside pipeline_classify_sample_location_cached, and [accession] is ", [accession])
28
+ print("len of save df: ", len(save_df))
29
+ if niche_cases: niche_cases=niche_cases.split(", ")
30
+ print("niche case in mtdna_backend.pipeline: ", niche_cases)
31
+ return await pipeline.pipeline_with_gemini([accession],stop_flag=stop_flag, save_df=save_df, niche_cases=niche_cases)
32
+
33
+ # Count and suggest final location
34
+ # def compute_final_suggested_location(rows):
35
+ # candidates = [
36
+ # row.get("Predicted Location", "").strip()
37
+ # for row in rows
38
+ # if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
39
+ # ] + [
40
+ # row.get("Inferred Region", "").strip()
41
+ # for row in rows
42
+ # if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
43
+ # ]
44
+
45
+ # if not candidates:
46
+ # return Counter(), ("Unknown", 0)
47
+ # # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
48
+ # tokens = []
49
+ # for item in candidates:
50
+ # # Split by comma, whitespace, and newlines
51
+ # parts = re.split(r'[\s,]+', item)
52
+ # tokens.extend(parts)
53
+
54
+ # # Step 2: Clean and normalize tokens
55
+ # tokens = [word.strip() for word in tokens if word.strip().isalpha()] # Keep only alphabetic tokens
56
+
57
+ # # Step 3: Count
58
+ # counts = Counter(tokens)
59
+
60
+ # # Step 4: Get most common
61
+ # top_location, count = counts.most_common(1)[0]
62
+ # return counts, (top_location, count)
63
+
64
+ # Store feedback (with required fields)
65
+
66
+ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
67
+ if not answer1.strip() or not answer2.strip():
68
+ return "⚠️ Please answer both questions before submitting."
69
+
70
+ try:
71
+ # Step: Load credentials from Hugging Face secret
72
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
73
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
74
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
75
+
76
+ # Connect to Google Sheet
77
+ client = gspread.authorize(creds)
78
+ sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
79
+
80
+ # Append feedback
81
+ sheet.append_row([accession, answer1, answer2, contact])
82
+ return "✅ Feedback submitted. Thank you!"
83
+
84
+ except Exception as e:
85
+ return f"❌ Error submitting feedback: {e}"
86
+
87
+ import re
88
+
89
+ ACCESSION_REGEX = re.compile(r'^[A-Z]{1,4}_?\d{6}(\.\d+)?$')
90
+
91
+ def is_valid_accession(acc):
92
+ return bool(ACCESSION_REGEX.match(acc))
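A few examples of what ACCESSION_REGEX accepts, based on the pattern above (1-4 uppercase letters, an optional underscore, exactly six digits, and an optional version suffix):

from mtdna_backend import is_valid_accession

for acc in ["KU131308", "NC_012920.1", "MT12345", "ku131308"]:
    print(acc, is_valid_accession(acc))
# KU131308     True   (2 letters + 6 digits)
# NC_012920.1  True   (underscore and version suffix allowed)
# MT12345      False  (only 5 digits)
# ku131308     False  (the pattern is uppercase-only)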
93
+
94
+ # helper function to extract accessions
95
+ def extract_accessions_from_input(file=None, raw_text=""):
96
+ print(f"RAW TEXT RECEIVED: {raw_text}")
97
+ accessions, invalid_accessions = [], []
98
+ seen = set()
99
+ if file:
100
+ try:
101
+ if file.name.endswith(".csv"):
102
+ df = pd.read_csv(file)
103
+ elif file.name.endswith(".xlsx"):
104
+ df = pd.read_excel(file)
105
+ else:
106
+ return [], "Unsupported file format. Please upload CSV or Excel."
107
+ for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
108
+ if acc not in seen:
109
+ if is_valid_accession(acc):
110
+ accessions.append(acc)
111
+ seen.add(acc)
112
+ else:
113
+ invalid_accessions.append(acc)
114
+
115
+ except Exception as e:
116
+ return [],[], f"Failed to read file: {e}"
117
+
118
+ if raw_text:
119
+ try:
120
+ text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
121
+ for acc in text_ids:
122
+ if acc not in seen:
123
+ if is_valid_accession(acc):
124
+ accessions.append(acc)
125
+ seen.add(acc)
126
+ else:
127
+ invalid_accessions.append(acc)
128
+ except Exception as e:
129
+ return [],[], f"Failed to read file: {e}"
130
+
131
+ return list(accessions), list(invalid_accessions), None
132
+ # Helper to skip accessions that already have complete rows: `get_incomplete_accessions()`
133
+ def get_incomplete_accessions(file_path):
134
+ df = pd.read_excel(file_path)
135
+
136
+ incomplete_accessions = []
137
+ for _, row in df.iterrows():
138
+ sample_id = str(row.get("Sample ID", "")).strip()
139
+
140
+ # Skip if no sample ID
141
+ if not sample_id:
142
+ continue
143
+
144
+ # Drop the Sample ID and check if the rest is empty
145
+ other_cols = row.drop(labels=["Sample ID"], errors="ignore")
146
+ if other_cols.isna().all() or (other_cols.astype(str).str.strip() == "").all():
147
+ # Extract the accession number from the sample ID using regex
148
+ match = re.search(r"\b[A-Z]{2,4}\d{4,}", sample_id)
149
+ if match:
150
+ incomplete_accessions.append(match.group(0))
151
+ print(len(incomplete_accessions))
152
+ return incomplete_accessions
153
+
154
+ # GOOGLE_SHEET_NAME = "known_samples"
155
+ # USAGE_DRIVE_FILENAME = "user_usage_log.json"
156
+ def truncate_cell(value, max_len=49000):
157
+ """Ensure cell content never exceeds Google Sheets 50k char limit."""
158
+ if not isinstance(value, str):
159
+ value = str(value)
160
+ return value[:max_len] + ("... [TRUNCATED]" if len(value) > max_len else "")
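A quick check of truncate_cell, which keeps each Google Sheets write under the 50,000-character cell limit:

from mtdna_backend import truncate_cell

cell = truncate_cell("x" * 60000)
print(len(cell))             # 49000 plus the 15-character "... [TRUNCATED]" marker
print(truncate_cell(12345))  # non-strings are stringified first: "12345"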
161
+
162
+
163
+ async def summarize_results(accession, stop_flag=None, niche_cases=None):
164
+ # Early bail
165
+ if stop_flag is not None and stop_flag.value:
166
+ print(f"🛑 Skipping {accession} before starting.")
167
+ return []
168
+ # try cache first
169
+ print("niche case in sum_result: ", niche_cases)
170
+ cached = check_known_output(accession, niche_cases)
171
+ if cached:
172
+ print(f" Using cached result for {accession}")
173
+ return [[
174
+ cached["Sample ID"] or "unknown",
175
+ cached["Predicted Country"] or "unknown",
176
+ cached["Country Explanation"] or "unknown",
177
+ cached["Predicted Sample Type"] or "unknown",
178
+ cached["Sample Type Explanation"] or "unknown",
179
+ cached["Sources"] or "No Links",
180
+ cached["Time cost"]
181
+ ]]
182
+ # only run when nothing in the cache
183
+ try:
184
+ print("try gemini pipeline: ",accession)
185
+ # Load credentials from Hugging Face secret
186
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
187
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
188
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
189
+ client = gspread.authorize(creds)
190
+
191
+ spreadsheet = client.open("known_samples")
192
+ sheet = spreadsheet.sheet1
193
+
194
+ data = sheet.get_all_values()
195
+ if not data:
196
+ print("⚠️ Google Sheet 'known_samples' is empty.")
197
+ return None
198
+
199
+ save_df = pd.DataFrame(data[1:], columns=data[0])
200
+ print("before pipeline, len of save df: ", len(save_df))
201
+ if niche_cases: niche_cases = ", ".join(niche_cases)
202
+ print("this is niche case inside summarize result: ", niche_cases)
203
+ outputs = await pipeline_classify_sample_location_cached(accession, stop_flag, save_df, niche_cases)
204
+ # outputs = {"KU131308":{"isolate":"BRU18",
205
+ # "country":{"brunei":['ncbi','rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples...']},
206
+ # "sample_type":{"modern":['rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples']},
207
+ # "query_cost":9.754999999999999e-05,
208
+ # "time_cost":'24.776 seconds',
209
+ # "source":['https://doi.org/10.1007/s00439-015-1620-z'],
210
+ # "file_chunk":"filechunk",
211
+ # "file_all_output":"fileoutput",
212
+ # 'specific location':{'brunei':["some explain"]}}}
213
+ if stop_flag is not None and stop_flag.value:
214
+ print(f"🛑 Skipped {accession} mid-pipeline.")
215
+ return []
216
+ # outputs = {'KU131308': {'isolate':'BRU18',
217
+ # 'country': {'brunei': ['ncbi',
218
+ # 'rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
219
+ # 'sample_type': {'modern':
220
+ # ['rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
221
+ # 'query_cost': 9.754999999999999e-05,
222
+ # 'time_cost': '24.776 seconds',
223
+ # 'source': ['https://doi.org/10.1007/s00439-015-1620-z',
224
+ # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf',
225
+ # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls']}}
226
+ except Exception as e:
227
+ return []#, f"Error: {e}", f"Error: {e}", f"Error: {e}"
228
+
229
+ if accession not in outputs:
230
+ print("no accession in output ", accession)
231
+ return []#, "Accession not found in results.", "Accession not found in results.", "Accession not found in results."
232
+
233
+ row_score = []
234
+ rows = []
235
+ save_rows = []
236
+ for key in outputs:
237
+ pred_country, pred_sample, country_explanation, sample_explanation = "unknown", "unknown", "unknown", "unknown"
+ pred_niche, niche_explanation = "unknown", "unknown"
238
+ checked_sections = ["country", "sample_type"]
239
+ niche_cases = niche_cases.split(", ") if niche_cases else []
240
+ if niche_cases: checked_sections += niche_cases
241
+ print("checked sections: ", checked_sections)
242
+ for section, results in outputs[key].items():
243
+ pred_output = []#"\n".join(list(results.keys()))
244
+ output_explanation = ""
245
+ print(section, results)
246
+ if section not in checked_sections: continue
247
+ for result, content in results.items():
248
+ if len(result) == 0: result = "unknown"
249
+ if len(content) == 0: output_explanation = "unknown"
250
+ else:
251
+ output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
252
+ pred_output.append(result)
253
+ pred_output = "\n".join(pred_output)
254
+ if section == "country":
255
+ pred_country, country_explanation = pred_output, output_explanation
256
+ elif section == "sample_type":
257
+ pred_sample, sample_explanation = pred_output, output_explanation
258
+ else:
259
+ pred_niche, niche_explanation = pred_output, output_explanation
260
+ if outputs[key]["isolate"].lower()!="unknown":
261
+ label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
262
+ else: label = key
263
+ if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
264
+
265
+ if niche_cases:
266
+ row = {
267
+ "Sample ID": truncate_cell(label or "unknown"),
268
+ "Predicted Country": truncate_cell(pred_country or "unknown"),
269
+ "Country Explanation": truncate_cell(country_explanation or "unknown"),
270
+ "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
271
+ "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
272
+ "Predicted " + niche_cases[0]: truncate_cell(pred_niche or "unknown"),
273
+ niche_cases[0] + " Explanation": truncate_cell(niche_explanation or "unknown"),
274
+ "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
275
+ "Time cost": truncate_cell(outputs[key]["time_cost"])
276
+ }
277
+ #row_score.append(row)
278
+ # rows.append(list(row.values()))
279
+ rows.append(row)
280
+
281
+ save_row = {
282
+ "Sample ID": truncate_cell(label or "unknown"),
283
+ "Predicted Country": truncate_cell(pred_country or "unknown"),
284
+ "Country Explanation": truncate_cell(country_explanation or "unknown"),
285
+ "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
286
+ "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
287
+ "Predicted " + niche_cases[0]: truncate_cell(pred_niche or "unknown"),
288
+ niche_cases[0] + " Explanation": truncate_cell(niche_explanation or "unknown"),
289
+ "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
290
+ "Query_cost": outputs[key]["query_cost"] or "",
291
+ "Time cost": outputs[key]["time_cost"] or "",
292
+ "file_chunk": truncate_cell(outputs[key]["file_chunk"] or ""),
293
+ "file_all_output": truncate_cell(outputs[key]["file_all_output"] or "")
294
+ }
295
+
296
+ #row_score.append(row)
297
+ #save_rows.append(list(save_row.values()))
298
+ save_rows.append(save_row)
299
+ else:
300
+ row = {
301
+ "Sample ID": truncate_cell(label or "unknown"),
302
+ "Predicted Country": truncate_cell(pred_country or "unknown"),
303
+ "Country Explanation": truncate_cell(country_explanation or "unknown"),
304
+ "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
305
+ "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
306
+ "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
307
+ "Time cost": truncate_cell(outputs[key]["time_cost"])
308
+ }
309
+ #row_score.append(row)
310
+ # rows.append(list(row.values()))
311
+ rows.append(row)
312
+ save_row = {
313
+ "Sample ID": truncate_cell(label or "unknown"),
314
+ "Predicted Country": truncate_cell(pred_country or "unknown"),
315
+ "Country Explanation": truncate_cell(country_explanation or "unknown"),
316
+ "Predicted Sample Type": truncate_cell(pred_sample or "unknown"),
317
+ "Sample Type Explanation": truncate_cell(sample_explanation or "unknown"),
318
+ "Sources": truncate_cell("\n".join(outputs[key]["source"]) or "No Links"),
319
+ "Query_cost": outputs[key]["query_cost"] or "",
320
+ "Time cost": outputs[key]["time_cost"] or "",
321
+ "file_chunk": truncate_cell(outputs[key]["file_chunk"] or ""),
322
+ "file_all_output": truncate_cell(outputs[key]["file_all_output"] or "")
323
+ }
324
+
325
+ #row_score.append(row)
326
+ #save_rows.append(list(save_row.values()))
327
+ save_rows.append(save_row)
328
+ print("the final rows: ", rows)
329
+
330
+ try:
331
+ # Prepare as DataFrame
332
+ df_new = pd.DataFrame(save_rows)
333
+ print("done df_new and here are save_rows: ", save_rows)
334
+ # Setup Google Sheets
335
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
336
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
337
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
338
+ client = gspread.authorize(creds)
339
+ spreadsheet = client.open("known_samples")
340
+ sheet = spreadsheet.sheet1
341
+
342
+ # ✅ Load existing data + headers
343
+ existing_data = sheet.get_all_values()
344
+ headers = existing_data[0] if existing_data else []
345
+ existing_df = pd.DataFrame(existing_data[1:], columns=headers) if len(existing_data) > 1 else pd.DataFrame()
346
+
347
+ # Extend headers if new keys appear in save_rows
348
+ print("df_new.col: ", df_new.columns)
349
+ for col in df_new.columns:
350
+ print(col)
351
+ if col not in headers:
352
+ headers.append(col)
353
+ # Add new column header in the sheet
354
+ sheet.update_cell(1, len(headers), col)
355
+
356
+ # ✅ Align DataFrame with sheet headers (fill missing with "")
357
+ df_new = df_new.reindex(columns=headers, fill_value="")
358
+
359
+ # Build lookup: Sample ID → row index
360
+ if "Sample ID" in existing_df.columns:
361
+ id_to_row = {sid: i + 2 for i, sid in enumerate(existing_df["Sample ID"])}
362
+ else:
363
+ id_to_row = {}
364
+
365
+ for _, row in df_new.iterrows():
366
+ sid = row.get("Sample ID", "")
367
+ row_values = [truncate_cell(str(row.get(h, ""))) for h in headers]
368
+ print("row_val of df_new: ", row_values)
369
+ if sid in id_to_row:
370
+ # Update existing row in correct header order
371
+ sheet.update(f"A{id_to_row[sid]}:{chr(64+len(headers))}{id_to_row[sid]}", [row_values])
372
+ else:
373
+ # ✅ Append new row
374
+ sheet.append_row(row_values)
375
+
376
+ print(" Match results safely saved to known_samples with dynamic headers.")
377
+
378
+ except Exception as e:
379
+ print(f"❌ Failed to update known_samples: {e}")
380
+
381
+
382
+ return rows
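Since summarize_results is a coroutine, a synchronous caller can drive it with asyncio.run — a sketch using an accession that already appears in this file's comments:
rows = asyncio.run(summarize_results("KU131308"))
print(rows)   # cache hits return a list of value lists; fresh runs return a list of row dicts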
383
+
384
+ def save_to_excel(all_rows, summary_text, flag_text, filename, is_resume=False):
385
+ df_new = pd.DataFrame(all_rows, columns=[
386
+ "Sample ID", "Predicted Country", "Country Explanation",
387
+ "Predicted Sample Type", "Sample Type Explanation",
388
+ "Sources", "Time cost"
389
+ ])
390
+
391
+ if is_resume and os.path.exists(filename):
392
+ try:
393
+ df_old = pd.read_excel(filename)
394
+ except Exception as e:
395
+ print(f"⚠️ Warning reading old Excel file: {e}")
396
+ df_old = pd.DataFrame(columns=df_new.columns)
397
+
398
+ # Set index and update existing rows
399
+ df_old.set_index("Sample ID", inplace=True)
400
+ df_new.set_index("Sample ID", inplace=True)
401
+ df_old.update(df_new)
402
+
403
+ df_combined = df_old.reset_index()
404
+ else:
405
+ # If not resuming or file doesn't exist, just use new rows
406
+ df_combined = df_new
407
+
408
+ try:
409
+ df_combined.to_excel(filename, index=False)
410
+ except Exception as e:
411
+ print(f" Failed to write Excel file {filename}: {e}")
412
+
413
+
414
+ # save the batch input in JSON file
415
+ def save_to_json(all_rows, summary_text, flag_text, filename):
416
+ output_dict = {
417
+ "Detailed_Results": all_rows#, # <-- make sure this is a plain list, not a DataFrame
418
+ # "Summary_Text": summary_text,
419
+ # "Ancient_Modern_Flag": flag_text
420
+ }
421
+
422
+ # If all_rows is a DataFrame, convert it
423
+ if isinstance(all_rows, pd.DataFrame):
424
+ output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")
425
+
426
+ with open(filename, "w") as external_file:
427
+ json.dump(output_dict, external_file, indent=2)
428
+
429
+ # save the batch input in Text file
430
+ def save_to_txt(all_rows, summary_text, flag_text, filename):
431
+ if isinstance(all_rows, pd.DataFrame):
432
+ detailed_results = all_rows.to_dict(orient="records")
433
+ output = ""
434
+ #output += ",".join(list(detailed_results[0].keys())) + "\n\n"
435
+ output += ",".join([str(k) for k in detailed_results[0].keys()]) + "\n\n"
436
+ for r in detailed_results:
437
+ output += ",".join([str(v) for v in r.values()]) + "\n\n"
438
+ with open(filename, "w") as f:
439
+ f.write("=== Detailed Results ===\n")
440
+ f.write(output + "\n")
441
+
442
+ # f.write("\n=== Summary ===\n")
443
+ # f.write(summary_text + "\n")
444
+
445
+ # f.write("\n=== Ancient/Modern Flag ===\n")
446
+ # f.write(flag_text + "\n")
447
+
448
+ def save_batch_output(all_rows, output_type, summary_text=None, flag_text=None):
449
+ tmp_dir = tempfile.mkdtemp()
450
+
451
+ #html_table = all_rows.value # assuming this is stored somewhere
452
+
453
+ # Parse back to DataFrame
454
+ #all_rows = pd.read_html(all_rows)[0] # [0] because read_html returns a list
455
+ all_rows = pd.read_html(StringIO(all_rows))[0]
456
+ print(all_rows)
457
+
458
+ if output_type == "Excel":
459
+ file_path = f"{tmp_dir}/batch_output.xlsx"
460
+ save_to_excel(all_rows, summary_text, flag_text, file_path)
461
+ elif output_type == "JSON":
462
+ file_path = f"{tmp_dir}/batch_output.json"
463
+ save_to_json(all_rows, summary_text, flag_text, file_path)
464
+ print("Done with JSON")
465
+ elif output_type == "TXT":
466
+ file_path = f"{tmp_dir}/batch_output.txt"
467
+ save_to_txt(all_rows, summary_text, flag_text, file_path)
468
+ else:
469
+ return gr.update(visible=False) # invalid option
470
+
471
+ return gr.update(value=file_path, visible=True)
472
+ # save cost by checking the known outputs
473
+
474
+ # def check_known_output(accession):
475
+ # if not os.path.exists(KNOWN_OUTPUT_PATH):
476
+ # return None
477
+
478
+ # try:
479
+ # df = pd.read_excel(KNOWN_OUTPUT_PATH)
480
+ # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
481
+ # if match:
482
+ # accession = match.group(0)
483
+
484
+ # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
485
+ # if not matched.empty:
486
+ # return matched.iloc[0].to_dict() # Return the cached row
487
+ # except Exception as e:
488
+ # print(f"⚠️ Failed to load known samples: {e}")
489
+ # return None
490
+
491
+ # def check_known_output(accession):
492
+ # try:
493
+ # # ✅ Load credentials from Hugging Face secret
494
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
495
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
496
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
497
+ # client = gspread.authorize(creds)
498
+
499
+ # # ✅ Open the known_samples sheet
500
+ # spreadsheet = client.open("known_samples") # Replace with your sheet name
501
+ # sheet = spreadsheet.sheet1
502
+
503
+ # # Read all rows
504
+ # data = sheet.get_all_values()
505
+ # if not data:
506
+ # return None
507
+
508
+ # df = pd.DataFrame(data[1:], columns=data[0]) # Skip header row
509
+
510
+ # # Normalize accession pattern
511
+ # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
512
+ # if match:
513
+ # accession = match.group(0)
514
+
515
+ # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
516
+ # if not matched.empty:
517
+ # return matched.iloc[0].to_dict()
518
+
519
+ # except Exception as e:
520
+ # print(f"⚠️ Failed to load known samples from Google Sheets: {e}")
521
+ # return None
522
+ # def check_known_output(accession):
523
+ # print("inside check known output function")
524
+ # try:
525
+ # # Load credentials from Hugging Face secret
526
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
527
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
528
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
529
+ # client = gspread.authorize(creds)
530
+
531
+ # spreadsheet = client.open("known_samples")
532
+ # sheet = spreadsheet.sheet1
533
+
534
+ # data = sheet.get_all_values()
535
+ # if not data:
536
+ # print("⚠️ Google Sheet 'known_samples' is empty.")
537
+ # return None
538
+
539
+ # df = pd.DataFrame(data[1:], columns=data[0])
540
+ # if "Sample ID" not in df.columns:
541
+ # print(" Column 'Sample ID' not found in Google Sheet.")
542
+ # return None
543
+
544
+ # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
545
+ # if match:
546
+ # accession = match.group(0)
547
+
548
+ # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
549
+ # if not matched.empty:
550
+ # #return matched.iloc[0].to_dict()
551
+ # row = matched.iloc[0]
552
+ # country = row.get("Predicted Country", "").strip().lower()
553
+ # sample_type = row.get("Predicted Sample Type", "").strip().lower()
554
+
555
+ # if country and country != "unknown" and sample_type and sample_type != "unknown":
556
+ # return row.to_dict()
557
+ # else:
558
+ # print(f"⚠️ Accession {accession} found but country/sample_type is unknown or empty.")
559
+ # return None
560
+ # else:
561
+ # print(f"🔍 Accession {accession} not found in known_samples.")
562
+ # return None
563
+
564
+ # except Exception as e:
565
+ # import traceback
566
+ # print("❌ Exception occurred during check_known_output:")
567
+ # traceback.print_exc()
568
+ # return None
569
+
570
+ import os
571
+ import re
572
+ import json
573
+ import time
574
+ import gspread
575
+ import pandas as pd
576
+ from oauth2client.service_account import ServiceAccountCredentials
577
+ from gspread.exceptions import APIError
578
+
579
+ # --- Global cache ---
580
+ _known_samples_cache = None
581
+
582
+ def load_known_samples():
583
+ """Load the Google Sheet 'known_samples' into a Pandas DataFrame and cache it."""
584
+ global _known_samples_cache
585
+ try:
586
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
587
+ scope = [
588
+ 'https://spreadsheets.google.com/feeds',
589
+ 'https://www.googleapis.com/auth/drive'
590
+ ]
591
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
592
+ client = gspread.authorize(creds)
593
+
594
+ sheet = client.open("known_samples").sheet1
595
+ data = sheet.get_all_values()
596
+
597
+ if not data:
598
+ print("⚠️ Google Sheet 'known_samples' is empty.")
599
+ _known_samples_cache = pd.DataFrame()
600
+ else:
601
+ _known_samples_cache = pd.DataFrame(data[1:], columns=data[0])
602
+ print(f"✅ Cached {_known_samples_cache.shape[0]} rows from known_samples")
603
+
604
+ except APIError as e:
605
+ print(f"❌ APIError while loading known_samples: {e}")
606
+ _known_samples_cache = pd.DataFrame()
607
+ except Exception as e:
608
+ import traceback
609
+ print("❌ Exception occurred while loading known_samples:")
610
+ traceback.print_exc()
611
+ _known_samples_cache = pd.DataFrame()
612
+
613
+ def check_known_output(accession, niche_cases=None):
614
+ """Check if an accession exists in the cached 'known_samples' sheet."""
615
+ global _known_samples_cache
616
+ print("inside check known output function")
617
+
618
+ try:
619
+ # Load cache if not already loaded
620
+ if _known_samples_cache is None:
621
+ load_known_samples()
622
+
623
+ if _known_samples_cache.empty:
624
+ print("⚠️ No cached data available.")
625
+ return None
626
+
627
+ # Extract proper accession format (e.g. AB12345)
628
+ match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
629
+ if match:
630
+ accession = match.group(0)
631
+
632
+ matched = _known_samples_cache[
633
+ _known_samples_cache["Sample ID"].str.contains(accession, case=False, na=False)
634
+ ]
635
+
636
+ if not matched.empty:
637
+ row = matched.iloc[0]
638
+ country = row.get("Predicted Country", "").strip().lower()
639
+ sample_type = row.get("Predicted Sample Type", "").strip().lower()
640
+ output_niche = None
641
+ if niche_cases:
642
+ niche_col = "Predicted " + niche_cases[0]
643
+ if niche_col not in _known_samples_cache.columns:
644
+ print(f"⚠️ Niche column '{niche_col}' not found in known_samples. Skipping cache.")
645
+ return None
646
+ output_niche = row.get("Predicted " + niche_cases[0], "").strip().lower()
647
+ if country and country.lower() not in ["","unknown"] and sample_type and sample_type.lower() not in ["","unknown"] and output_niche and output_niche.lower() not in ["","unknown"]:
648
+ print(f"🎯 Found {accession} in cache")
649
+ return row.to_dict()
650
+ else:
651
+ print(f"⚠️ Accession {accession} found but country/sample_type unknown or empty.")
652
+ return None
653
+ else:
654
+ if country and country.lower() not in ["","unknown"] and sample_type and sample_type.lower() not in ["","unknown"]:
655
+ print(f"🎯 Found {accession} in cache")
656
+ return row.to_dict()
657
+ else:
658
+ print(f"⚠️ Accession {accession} found but country/sample_type unknown or empty.")
659
+ return None
660
+ else:
661
+ print(f"🔍 Accession {accession} not found in cache.")
662
+ return None
663
+
664
+ except Exception as e:
665
+ import traceback
666
+ print("❌ Exception occurred during check_known_output:")
667
+ traceback.print_exc()
668
+ return None
669
+
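A usage sketch for the cached lookup; the sheet is read once per process, so load_known_samples() can be called again to refresh the cache:
load_known_samples()                      # optional explicit refresh
hit = check_known_output("KU131308.1")    # normalized to "KU131308" before matching
if hit:
    print(hit["Predicted Country"], hit["Predicted Sample Type"])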
670
+
671
+
672
+ def hash_user_id(user_input):
673
+ return hashlib.sha256(user_input.encode()).hexdigest()
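A one-line sketch: hashing is deterministic, so the same email always maps to the same opaque key.
hash_user_id("user@example.com") == hash_user_id("user@example.com")   # True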
674
+
675
+ # Load and save usage count
676
+
677
+ # def load_user_usage():
678
+ # if not os.path.exists(USER_USAGE_TRACK_FILE):
679
+ # return {}
680
+
681
+ # try:
682
+ # with open(USER_USAGE_TRACK_FILE, "r") as f:
683
+ # content = f.read().strip()
684
+ # if not content:
685
+ # return {} # file is empty
686
+ # return json.loads(content)
687
+ # except (json.JSONDecodeError, ValueError):
688
+ # print("⚠️ Warning: user_usage.json is corrupted or invalid. Resetting.")
689
+ # return {} # fallback to empty dict
690
+ # def load_user_usage():
691
+ # try:
692
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
693
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
694
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
695
+ # client = gspread.authorize(creds)
696
+
697
+ # sheet = client.open("user_usage_log").sheet1
698
+ # data = sheet.get_all_records() # Assumes columns: email, usage_count
699
+
700
+ # usage = {}
701
+ # for row in data:
702
+ # email = row.get("email", "").strip().lower()
703
+ # count = int(row.get("usage_count", 0))
704
+ # if email:
705
+ # usage[email] = count
706
+ # return usage
707
+ # except Exception as e:
708
+ # print(f"⚠️ Failed to load user usage from Google Sheets: {e}")
709
+ # return {}
710
+ # def load_user_usage():
711
+ # try:
712
+ # parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
713
+ # iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
714
+
715
+ # found = pipeline.find_drive_file("user_usage_log.json", parent_id=iterate3_id)
716
+ # if not found:
717
+ # return {} # not found, start fresh
718
+
719
+ # #file_id = found[0]["id"]
720
+ # file_id = found
721
+ # content = pipeline.download_drive_file_content(file_id)
722
+ # return json.loads(content.strip()) if content.strip() else {}
723
+
724
+ # except Exception as e:
725
+ # print(f"⚠️ Failed to load user_usage_log.json from Google Drive: {e}")
726
+ # return {}
727
+ def load_user_usage():
728
+ try:
729
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
730
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
731
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
732
+ client = gspread.authorize(creds)
733
+
734
+ sheet = client.open("user_usage_log").sheet1
735
+ data = sheet.get_all_values()
736
+ print("data: ", data)
737
+ print("🧪 Raw header row from sheet:", data[0])
738
+ print("🧪 Character codes in each header:")
739
+ for h in data[0]:
740
+ print([ord(c) for c in h])
741
+
742
+ if not data or len(data) < 2:
743
+ print("⚠️ Sheet is empty or missing rows.")
744
+ return {}, {}
745
+
746
+ headers = [h.strip().lower() for h in data[0]]
747
+ if "email" not in headers or "usage_count" not in headers:
748
+ print("❌ Header format incorrect. Must have 'email' and 'usage_count'.")
749
+ return {}, {}
750
+
751
+ permitted_index = headers.index("permitted_samples") if "permitted_samples" in headers else None
752
+ df = pd.DataFrame(data[1:], columns=headers)
753
+
754
+ usage = {}
755
+ permitted = {}
756
+ for _, row in df.iterrows():
757
+ email = row.get("email", "").strip().lower()
758
+ try:
759
+ #count = int(row.get("usage_count", 0))
760
+ try:
761
+ count = int(float(row.get("usage_count", 0)))
762
+ except Exception:
763
+ print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
764
+ count = 0
765
+
766
+ if email:
767
+ usage[email] = count
768
+ if permitted_index is not None:
769
+ try:
770
+ permitted_count = int(float(row.get("permitted_samples", 50)))
771
+ permitted[email] = permitted_count
772
+ except:
773
+ permitted[email] = 50
774
+
775
+ except ValueError:
776
+ print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
777
+ return usage, permitted
778
+
779
+ except Exception as e:
780
+ print(f"❌ Error in load_user_usage: {e}")
781
+ return {}, {}
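A sketch of the expected user_usage_log layout and the two dicts it yields (the email below is a made-up placeholder):
# sheet headers: email | usage_count | permitted_samples (optional)
usage, permitted = load_user_usage()
print(usage.get("someone@lab.org", 0), permitted.get("someone@lab.org", 50))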
782
+
783
+
784
+
785
+ # def save_user_usage(usage):
786
+ # with open(USER_USAGE_TRACK_FILE, "w") as f:
787
+ # json.dump(usage, f, indent=2)
788
+
789
+ # def save_user_usage(usage_dict):
790
+ # try:
791
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
792
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
793
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
794
+ # client = gspread.authorize(creds)
795
+
796
+ # sheet = client.open("user_usage_log").sheet1
797
+ # sheet.clear() # clear old contents first
798
+
799
+ # # Write header + rows
800
+ # rows = [["email", "usage_count"]] + [[email, count] for email, count in usage_dict.items()]
801
+ # sheet.update(rows)
802
+ # except Exception as e:
803
+ # print(f"❌ Failed to save user usage to Google Sheets: {e}")
804
+ # def save_user_usage(usage_dict):
805
+ # try:
806
+ # parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
807
+ # iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
808
+
809
+ # import tempfile
810
+ # tmp_path = os.path.join(tempfile.gettempdir(), "user_usage_log.json")
811
+ # print("💾 Saving this usage dict:", usage_dict)
812
+ # with open(tmp_path, "w") as f:
813
+ # json.dump(usage_dict, f, indent=2)
814
+
815
+ # pipeline.upload_file_to_drive(tmp_path, "user_usage_log.json", iterate3_id)
816
+
817
+ # except Exception as e:
818
+ # print(f"❌ Failed to save user_usage_log.json to Google Drive: {e}")
819
+ # def save_user_usage(usage_dict):
820
+ # try:
821
+ # creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
822
+ # scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
823
+ # creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
824
+ # client = gspread.authorize(creds)
825
+
826
+ # spreadsheet = client.open("user_usage_log")
827
+ # sheet = spreadsheet.sheet1
828
+
829
+ # # Step 1: Convert new usage to DataFrame
830
+ # df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
831
+ # df_new["email"] = df_new["email"].str.strip().str.lower()
832
+
833
+ # # Step 2: Load existing data
834
+ # existing_data = sheet.get_all_values()
835
+ # print("🧪 Sheet existing_data:", existing_data)
836
+
837
+ # # Try to load old data
838
+ # if existing_data and len(existing_data[0]) >= 1:
839
+ # df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
840
+
841
+ # # Fix missing columns
842
+ # if "email" not in df_old.columns:
843
+ # df_old["email"] = ""
844
+ # if "usage_count" not in df_old.columns:
845
+ # df_old["usage_count"] = 0
846
+
847
+ # df_old["email"] = df_old["email"].str.strip().str.lower()
848
+ # df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
849
+ # else:
850
+ # df_old = pd.DataFrame(columns=["email", "usage_count"])
851
+
852
+ # # Step 3: Merge
853
+ # df_combined = pd.concat([df_old, df_new], ignore_index=True)
854
+ # df_combined = df_combined.groupby("email", as_index=False).sum()
855
+
856
+ # # Step 4: Write back
857
+ # sheet.clear()
858
+ # sheet.update([df_combined.columns.tolist()] + df_combined.astype(str).values.tolist())
859
+ # print("✅ Saved user usage to user_usage_log sheet.")
860
+
861
+ # except Exception as e:
862
+ # print(f"❌ Failed to save user usage to Google Sheets: {e}")
863
+ def save_user_usage(usage_dict):
864
+ try:
865
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
866
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
867
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
868
+ client = gspread.authorize(creds)
869
+
870
+ spreadsheet = client.open("user_usage_log")
871
+ sheet = spreadsheet.sheet1
872
+
873
+ # Build new df
874
+ df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
875
+ df_new["email"] = df_new["email"].str.strip().str.lower()
876
+ df_new["usage_count"] = pd.to_numeric(df_new["usage_count"], errors="coerce").fillna(0).astype(int)
877
+
878
+ # Read existing data
879
+ existing_data = sheet.get_all_values()
880
+ if existing_data and len(existing_data[0]) >= 2:
881
+ df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
882
+ df_old["email"] = df_old["email"].str.strip().str.lower()
883
+ df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
884
+ else:
885
+ df_old = pd.DataFrame(columns=["email", "usage_count"])
886
+
887
+ # Overwrite specific emails only
888
+ df_old = df_old.set_index("email")
889
+ for email, count in usage_dict.items():
890
+ email = email.strip().lower()
891
+ df_old.loc[email, "usage_count"] = count
892
+ df_old = df_old.reset_index()
893
+
894
+ # Save
895
+ sheet.clear()
896
+ sheet.update([df_old.columns.tolist()] + df_old.astype(str).values.tolist())
897
+ print("✅ Saved user usage to user_usage_log sheet.")
898
+
899
+ except Exception as e:
900
+ print(f"❌ Failed to save user usage to Google Sheets: {e}")
901
+
902
+
903
+
904
+
905
+ # def increment_usage(user_id, num_samples=1):
906
+ # usage = load_user_usage()
907
+ # if user_id not in usage:
908
+ # usage[user_id] = 0
909
+ # usage[user_id] += num_samples
910
+ # save_user_usage(usage)
911
+ # return usage[user_id]
912
+ # def increment_usage(email: str, count: int):
913
+ # usage = load_user_usage()
914
+ # email_key = email.strip().lower()
915
+ # usage[email_key] = usage.get(email_key, 0) + count
916
+ # save_user_usage(usage)
917
+ # return usage[email_key]
918
+ def increment_usage(email: str, count: int = 1):
919
+ usage, permitted = load_user_usage()
920
+ email_key = email.strip().lower()
921
+ #usage[email_key] = usage.get(email_key, 0) + count
922
+ current = usage.get(email_key, 0)
923
+ new_value = current + count
924
+ max_allowed = permitted.get(email_key) or 50
925
+ usage[email_key] = max(current, new_value) # ✅ Prevent overwrite with lower
926
+ print(f"🧪 increment_usage saving: {email_key=} {current=} + {count=} => {usage[email_key]=}")
927
+ print("max allow is: ", max_allowed)
928
+ save_user_usage(usage)
929
+ return usage[email_key], max_allowed
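A sketch of quota checking with increment_usage (placeholder email):
used, allowed = increment_usage("someone@lab.org", count=3)
if used >= allowed:
    print("quota reached")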
930
+
931
+
932
+ # run the batch
933
+ def summarize_batch(file=None, raw_text="", resume_file=None, user_email="",
934
+ stop_flag=None, output_file_path=None,
935
+ limited_acc=50, yield_callback=None):
936
+ if user_email:
937
+ limited_acc += 10
938
+ accessions, invalid_accessions, error = extract_accessions_from_input(file, raw_text)
939
+ if error:
940
+ #return [], "", "", f"Error: {error}"
941
+ return [], f"Error: {error}", 0, "", ""
942
+ if resume_file:
943
+ accessions = get_incomplete_accessions(resume_file)
944
+ tmp_dir = tempfile.mkdtemp()
945
+ if not output_file_path:
946
+ if resume_file:
947
+ output_file_path = os.path.join(tmp_dir, resume_file)
948
+ else:
949
+ output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
950
+
951
+ all_rows = []
952
+ # all_summaries = []
953
+ # all_flags = []
954
+ progress_lines = []
955
+ warning = ""
956
+ if len(accessions) > limited_acc:
957
+ accessions = accessions[:limited_acc]
958
+ warning = f"Your number of accessions is more than the {limited_acc}, only handle first {limited_acc} accessions"
959
+ for i, acc in enumerate(accessions):
960
+ if stop_flag and stop_flag.value:
961
+ line = f"🛑 Stopped at {acc} ({i+1}/{len(accessions)})"
962
+ progress_lines.append(line)
963
+ if yield_callback:
964
+ yield_callback(line)
965
+ print("🛑 User requested stop.")
966
+ break
967
+ print(f"[{i+1}/{len(accessions)}] Processing {acc}")
968
+ try:
969
+ # rows, summary, label, explain = summarize_results(acc)
970
+ # summarize_results is a coroutine; run it to completion here (asyncio is part of the standard library)
+ rows = asyncio.run(summarize_results(acc, stop_flag=stop_flag))
971
+ all_rows.extend(rows)
972
+ # all_summaries.append(f"**{acc}**\n{summary}")
973
+ # all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
974
+ #save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path)
975
+ save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path, is_resume=bool(resume_file))
976
+ line = f"✅ Processed {acc} ({i+1}/{len(accessions)})"
977
+ progress_lines.append(line)
978
+ if yield_callback:
979
+ yield_callback(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
980
+ except Exception as e:
981
+ print(f"❌ Failed to process {acc}: {e}")
982
+ continue
983
+ #all_summaries.append(f"**{acc}**: Failed - {e}")
984
+ #progress_lines.append(f" Processed {acc} ({i+1}/{len(accessions)})")
985
+ limited_acc -= 1
986
+ """for row in all_rows:
987
+ source_column = row[2] # Assuming the "Source" is in the 3rd column (index 2)
988
+
989
+ if source_column.startswith("http"): # Check if the source is a URL
990
+ # Wrap it with HTML anchor tags to make it clickable
991
+ row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""
992
+ if not warning:
993
+ warning = f"You only have {limited_acc} left"
994
+ if user_email.strip():
995
+ user_hash = hash_user_id(user_email)
996
+ total_queries, _ = increment_usage(user_hash, len(all_rows))
997
+ else:
998
+ total_queries = 0
999
+ yield_callback("✅ Finished!")
1000
+
1001
+ # summary_text = "\n\n---\n\n".join(all_summaries)
1002
+ # flag_text = "\n\n---\n\n".join(all_flags)
1003
+ #return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
1004
+ #return all_rows, gr.update(visible=True), gr.update(visible=False)
1005
  return all_rows, output_file_path, total_queries, "\n".join(progress_lines), warning
mtdna_classifier.py CHANGED
@@ -1,769 +1,769 @@
1
- # mtDNA Location Classifier MVP (Google Colab)
2
- # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
- import os
4
- #import streamlit as st
5
- import subprocess
6
- import re
7
- from Bio import Entrez
8
- import fitz
9
- import spacy
10
- from spacy.cli import download
11
- from NER.PDF import pdf
12
- from NER.WordDoc import wordDoc
13
- from NER.html import extractHTML
14
- from NER.word2Vec import word2vec
15
- from transformers import pipeline
16
- import urllib.parse, requests
17
- from pathlib import Path
18
- from upgradeClassify import filter_context_for_sample, infer_location_for_sample
19
- import model
20
- # Set your email (required by NCBI Entrez)
21
- #Entrez.email = "your-email@example.com"
22
- import nltk
23
-
24
- nltk.download("stopwords")
25
- nltk.download("punkt")
26
- nltk.download('punkt_tab')
27
- # Step 1: Get PubMed ID from Accession using EDirect
28
- from Bio import Entrez, Medline
29
- import re
30
-
31
- Entrez.email = "your_email@example.com"
32
-
33
- # --- Helper Functions (Re-organized and Upgraded) ---
34
-
35
- def fetch_ncbi_metadata(accession_number):
36
- """
37
- Fetches metadata directly from NCBI GenBank using Entrez.
38
- Includes robust error handling and improved field extraction.
39
- Prioritizes location extraction from geo_loc_name, then notes, then other qualifiers.
40
- Also attempts to extract ethnicity and sample_type (ancient/modern).
41
-
42
- Args:
43
- accession_number (str): The NCBI accession number (e.g., "ON792208").
44
-
45
- Returns:
46
- dict: A dictionary containing 'country', 'specific_location', 'ethnicity',
47
- 'sample_type', 'collection_date', 'isolate', 'title', 'doi', 'pubmed_id'.
48
- """
49
- Entrez.email = "your.email@example.com" # Required by NCBI, REPLACE WITH YOUR EMAIL
50
-
51
- country = "unknown"
52
- specific_location = "unknown"
53
- ethnicity = "unknown"
54
- sample_type = "unknown"
55
- collection_date = "unknown"
56
- isolate = "unknown"
57
- title = "unknown"
58
- doi = "unknown"
59
- pubmed_id = None
60
- all_feature = "unknown"
61
-
62
- KNOWN_COUNTRIES = [
63
- "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan",
64
- "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi",
65
- "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo (Brazzaville)", "Congo (Kinshasa)", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czechia",
66
- "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia",
67
- "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana",
68
- "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan",
69
- "Kazakhstan", "Kenya", "Kiribati", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg",
70
- "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar",
71
- "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Korea", "North Macedonia", "Norway", "Oman",
72
- "Pakistan", "Palau", "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda",
73
- "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden", "Switzerland", "Syria",
74
- "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu",
75
- "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam",
76
- "Yemen", "Zambia", "Zimbabwe"
77
- ]
78
- COUNTRY_PATTERN = re.compile(r'\b(' + '|'.join(re.escape(c) for c in KNOWN_COUNTRIES) + r')\b', re.IGNORECASE)
79
-
80
- try:
81
- handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
82
- record = Entrez.read(handle)
83
- handle.close()
84
-
85
- gb_seq = None
86
- # Validate record structure: It should be a list with at least one element (a dict)
87
- if isinstance(record, list) and len(record) > 0:
88
- if isinstance(record[0], dict):
89
- gb_seq = record[0]
90
- else:
91
- print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
92
- else:
93
- print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
94
-
95
- # If gb_seq is still None, return defaults
96
- if gb_seq is None:
97
- return {"country": "unknown",
98
- "specific_location": "unknown",
99
- "ethnicity": "unknown",
100
- "sample_type": "unknown",
101
- "collection_date": "unknown",
102
- "isolate": "unknown",
103
- "title": "unknown",
104
- "doi": "unknown",
105
- "pubmed_id": None,
106
- "all_features": "unknown"}
107
-
108
-
109
- # If gb_seq is valid, proceed with extraction
110
- collection_date = gb_seq.get("GBSeq_create-date","unknown")
111
-
112
- references = gb_seq.get("GBSeq_references", [])
113
- for ref in references:
114
- if not pubmed_id:
115
- pubmed_id = ref.get("GBReference_pubmed",None)
116
- if title == "unknown":
117
- title = ref.get("GBReference_title","unknown")
118
- for xref in ref.get("GBReference_xref", []):
119
- if xref.get("GBXref_dbname") == "doi":
120
- doi = xref.get("GBXref_id")
121
- break
122
-
123
- features = gb_seq.get("GBSeq_feature-table", [])
124
-
125
- context_for_flagging = "" # Accumulate text for ancient/modern detection
126
- features_context = ""
127
- for feature in features:
128
- if feature.get("GBFeature_key") == "source":
129
- feature_context = ""
130
- qualifiers = feature.get("GBFeature_quals", [])
131
- found_country = "unknown"
132
- found_specific_location = "unknown"
133
- found_ethnicity = "unknown"
134
-
135
- temp_geo_loc_name = "unknown"
136
- temp_note_origin_locality = "unknown"
137
- temp_country_qual = "unknown"
138
- temp_locality_qual = "unknown"
139
- temp_collection_location_qual = "unknown"
140
- temp_isolation_source_qual = "unknown"
141
- temp_env_sample_qual = "unknown"
142
- temp_pop_qual = "unknown"
143
- temp_organism_qual = "unknown"
144
- temp_specimen_qual = "unknown"
145
- temp_strain_qual = "unknown"
146
-
147
- for qual in qualifiers:
148
- qual_name = qual.get("GBQualifier_name")
149
- qual_value = qual.get("GBQualifier_value")
150
- feature_context += qual_name + ": " + qual_value +"\n"
151
- if qual_name == "collection_date":
152
- collection_date = qual_value
153
- elif qual_name == "isolate":
154
- isolate = qual_value
155
- elif qual_name == "population":
156
- temp_pop_qual = qual_value
157
- elif qual_name == "organism":
158
- temp_organism_qual = qual_value
159
- elif qual_name == "specimen_voucher" or qual_name == "specimen":
160
- temp_specimen_qual = qual_value
161
- elif qual_name == "strain":
162
- temp_strain_qual = qual_value
163
- elif qual_name == "isolation_source":
164
- temp_isolation_source_qual = qual_value
165
- elif qual_name == "environmental_sample":
166
- temp_env_sample_qual = qual_value
167
-
168
- if qual_name == "geo_loc_name": temp_geo_loc_name = qual_value
169
- elif qual_name == "note":
170
- if qual_value.startswith("origin_locality:"):
171
- temp_note_origin_locality = qual_value
172
- context_for_flagging += qual_value + " " # Capture all notes for flagging
173
- elif qual_name == "country": temp_country_qual = qual_value
174
- elif qual_name == "locality": temp_locality_qual = qual_value
175
- elif qual_name == "collection_location": temp_collection_location_qual = qual_value
176
-
177
-
178
- # --- Aggregate all relevant info into context_for_flagging ---
179
- context_for_flagging += f" {isolate} {temp_isolation_source_qual} {temp_specimen_qual} {temp_strain_qual} {temp_organism_qual} {temp_geo_loc_name} {temp_collection_location_qual} {temp_env_sample_qual}"
180
- context_for_flagging = context_for_flagging.strip()
181
-
182
- # --- Determine final country and specific_location based on priority ---
183
- if temp_geo_loc_name != "unknown":
184
- parts = [p.strip() for p in temp_geo_loc_name.split(':')]
185
- if len(parts) > 1:
186
- found_specific_location = parts[-1]; found_country = parts[0]
187
- else: found_country = temp_geo_loc_name; found_specific_location = "unknown"
188
- elif temp_note_origin_locality != "unknown":
189
- match = re.search(r"origin_locality:\s*(.*)", temp_note_origin_locality, re.IGNORECASE)
190
- if match:
191
- location_string = match.group(1).strip()
192
- parts = [p.strip() for p in location_string.split(':')]
193
- if len(parts) > 1:
194
- #found_country = parts[-1]; found_specific_location = parts[0]
195
- found_country = model.get_country_from_text(temp_note_origin_locality.lower())
196
- if found_country == "unknown":
197
- found_country = parts[0];
198
- found_specific_location = parts[-1]
199
- else: found_country = location_string; found_specific_location = "unknown"
200
- elif temp_locality_qual != "unknown":
201
- found_country_match = COUNTRY_PATTERN.search(temp_locality_qual)
202
- if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_locality_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
203
- else: found_specific_location = temp_locality_qual; found_country = "unknown"
204
- elif temp_collection_location_qual != "unknown":
205
- found_country_match = COUNTRY_PATTERN.search(temp_collection_location_qual)
206
- if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_collection_location_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
207
- else: found_specific_location = temp_collection_location_qual; found_country = "unknown"
208
- elif temp_isolation_source_qual != "unknown":
209
- found_country_match = COUNTRY_PATTERN.search(temp_isolation_source_qual)
210
- if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_isolation_source_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
211
- else: found_specific_location = temp_isolation_source_qual; found_country = "unknown"
212
- elif temp_env_sample_qual != "unknown":
213
- found_country_match = COUNTRY_PATTERN.search(temp_env_sample_qual)
214
- if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_env_sample_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
215
- else: found_specific_location = temp_env_sample_qual; found_country = "unknown"
216
- if found_country == "unknown" and temp_country_qual != "unknown":
217
- found_country_match = COUNTRY_PATTERN.search(temp_country_qual)
218
- if found_country_match: found_country = found_country_match.group(1)
219
-
220
- country = found_country
221
- specific_location = found_specific_location
222
- # --- Determine final ethnicity ---
223
- if temp_pop_qual != "unknown":
224
- found_ethnicity = temp_pop_qual
225
- elif isolate != "unknown" and re.fullmatch(r'[A-Za-z\s\-]+', isolate) and get_country_from_text(isolate) == "unknown":
226
- found_ethnicity = isolate
227
- elif context_for_flagging != "unknown": # Use the broader context for ethnicity patterns
228
- eth_match = re.search(r'(?:population|ethnicity|isolate source):\s*([A-Za-z\s\-]+)', context_for_flagging, re.IGNORECASE)
229
- if eth_match:
230
- found_ethnicity = eth_match.group(1).strip()
231
-
232
- ethnicity = found_ethnicity
233
-
234
- # --- Determine sample_type (ancient/modern) ---
235
- if context_for_flagging:
236
- sample_type, explain = detect_ancient_flag(context_for_flagging)
237
- features_context += feature_context + "\n"
238
- break
239
-
240
- if specific_location != "unknown" and specific_location.lower() == country.lower():
241
- specific_location = "unknown"
242
- if not features_context: features_context = "unknown"
243
- return {"country": country.lower(),
244
- "specific_location": specific_location.lower(),
245
- "ethnicity": ethnicity.lower(),
246
- "sample_type": sample_type.lower(),
247
- "collection_date": collection_date,
248
- "isolate": isolate,
249
- "title": title,
250
- "doi": doi,
251
- "pubmed_id": pubmed_id,
252
- "all_features": features_context}
253
-
254
- except:
255
- print(f"Error fetching NCBI data for {accession_number}")
256
- return {"country": "unknown",
257
- "specific_location": "unknown",
258
- "ethnicity": "unknown",
259
- "sample_type": "unknown",
260
- "collection_date": "unknown",
261
- "isolate": "unknown",
262
- "title": "unknown",
263
- "doi": "unknown",
264
- "pubmed_id": None,
265
- "all_features": "unknown"}
266
-
267
- # --- Helper function for country matching (re-defined from main code to be self-contained) ---
268
- _country_keywords = {
269
- "thailand": "Thailand", "laos": "Laos", "cambodia": "Cambodia", "myanmar": "Myanmar",
270
- "philippines": "Philippines", "indonesia": "Indonesia", "malaysia": "Malaysia",
271
- "china": "China", "chinese": "China", "india": "India", "taiwan": "Taiwan",
272
- "vietnam": "Vietnam", "russia": "Russia", "siberia": "Russia", "nepal": "Nepal",
273
- "japan": "Japan", "sumatra": "Indonesia", "borneu": "Indonesia",
274
- "yunnan": "China", "tibet": "China", "northern mindanao": "Philippines",
275
- "west malaysia": "Malaysia", "north thailand": "Thailand", "central thailand": "Thailand",
276
- "northeast thailand": "Thailand", "east myanmar": "Myanmar", "west thailand": "Thailand",
277
- "central india": "India", "east india": "India", "northeast india": "India",
278
- "south sibera": "Russia", "mongolia": "China", "beijing": "China", "south korea": "South Korea",
279
- "north asia": "unknown", "southeast asia": "unknown", "east asia": "unknown"
280
- }
281
-
282
- def get_country_from_text(text):
283
- text_lower = text.lower()
284
- for keyword, country in _country_keywords.items():
285
- if keyword in text_lower:
286
- return country
287
- return "unknown"
288
- # The result will be seen as manualLink for the function get_paper_text
289
- # def search_google_custom(query, max_results=3):
290
- # # query should be the title from ncbi or paper/source title
291
- # GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY"]
292
- # GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"]
293
- # endpoint = os.environ["SEARCH_ENDPOINT"]
294
- # params = {
295
- # "key": GOOGLE_CSE_API_KEY,
296
- # "cx": GOOGLE_CSE_CX,
297
- # "q": query,
298
- # "num": max_results
299
- # }
300
- # try:
301
- # response = requests.get(endpoint, params=params)
302
- # if response.status_code == 429:
303
- # print("Rate limit hit. Try again later.")
304
- # return []
305
- # response.raise_for_status()
306
- # data = response.json().get("items", [])
307
- # return [item.get("link") for item in data if item.get("link")]
308
- # except Exception as e:
309
- # print("Google CSE error:", e)
310
- # return []
311
-
312
- def search_google_custom(query, max_results=3):
313
- # query should be the title from ncbi or paper/source title
314
- GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY"]
315
- GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"]
316
- endpoint = os.environ["SEARCH_ENDPOINT"]
317
- params = {
318
- "key": GOOGLE_CSE_API_KEY,
319
- "cx": GOOGLE_CSE_CX,
320
- "q": query,
321
- "num": max_results
322
- }
323
- try:
324
- response = requests.get(endpoint, params=params)
325
- if response.status_code == 429:
326
- print("Rate limit hit. Try again later.")
327
- print("try with back up account")
328
- try:
329
- return search_google_custom_backup(query, max_results)
330
- except:
331
- return []
332
- response.raise_for_status()
333
- data = response.json().get("items", [])
334
- return [item.get("link") for item in data if item.get("link")]
335
- except Exception as e:
336
- print("Google CSE error:", e)
337
- return []
338
-
339
- def search_google_custom_backup(query, max_results=3):
340
- # query should be the title from ncbi or paper/source title
341
- GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY_BACKUP"]
342
- GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX_BACKUP"]
343
- endpoint = os.environ["SEARCH_ENDPOINT"]
344
- params = {
345
- "key": GOOGLE_CSE_API_KEY,
346
- "cx": GOOGLE_CSE_CX,
347
- "q": query,
348
- "num": max_results
349
- }
350
- try:
351
- response = requests.get(endpoint, params=params)
352
- if response.status_code == 429:
353
- print("Rate limit hit. Try again later.")
354
- return []
355
- response.raise_for_status()
356
- data = response.json().get("items", [])
357
- return [item.get("link") for item in data if item.get("link")]
358
- except Exception as e:
359
- print("Google CSE error:", e)
360
- return []
361
- # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
362
- # Step 3.1: Extract Text
363
- # sub: download excel file
364
- def download_excel_file(url, save_path="temp.xlsx"):
365
- if "view.officeapps.live.com" in url:
366
- parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
367
- real_url = urllib.parse.unquote(parsed_url["src"][0])
368
- response = requests.get(real_url)
369
- with open(save_path, "wb") as f:
370
- f.write(response.content)
371
- return save_path
372
- elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
373
- response = requests.get(url)
374
- response.raise_for_status() # Raises error if download fails
375
- with open(save_path, "wb") as f:
376
- f.write(response.content)
377
- return save_path
378
- else:
379
- print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
380
- return url
381
- def get_paper_text(doi,id,manualLinks=None):
382
- # create the temporary folder to contain the texts
383
- folder_path = Path("data/"+str(id))
384
- if not folder_path.exists():
385
- cmd = f'mkdir data/{id}'
386
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
387
- print("data/"+str(id) +" created.")
388
- else:
389
- print("data/"+str(id) +" already exists.")
390
- saveLinkFolder = "data/"+id
391
-
392
- link = 'https://doi.org/' + doi
393
- '''textsToExtract = { "doiLink":"paperText"
394
- "file1.pdf":"text1",
395
- "file2.doc":"text2",
396
- "file3.xlsx":excelText3'''
397
- textsToExtract = {}
398
- # get the file to create listOfFile for each id
399
- html = extractHTML.HTML("",link)
400
- jsonSM = html.getSupMaterial()
401
- text = ""
402
- links = [link] + sum((jsonSM[key] for key in jsonSM),[])
403
- if manualLinks != None:
404
- links += manualLinks
405
- for l in links:
406
- # get the main paper
407
- name = l.split("/")[-1]
408
- file_path = folder_path / name
409
- if l == link:
410
- text = html.getListSection()
411
- textsToExtract[link] = text
412
- elif l.endswith(".pdf"):
413
- if file_path.is_file():
414
- l = saveLinkFolder + "/" + name
415
- print("File exists.")
416
- p = pdf.PDF(l,saveLinkFolder,doi)
417
- f = p.openPDFFile()
418
- pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
419
- doc = fitz.open(pdf_path)
420
- text = "\n".join([page.get_text() for page in doc])
421
- textsToExtract[l] = text
422
- elif l.endswith(".doc") or l.endswith(".docx"):
423
- d = wordDoc.wordDoc(l,saveLinkFolder)
424
- text = d.extractTextByPage()
425
- textsToExtract[l] = text
426
- elif l.split(".")[-1].lower() in "xlsx":
427
- wc = word2vec.word2Vec()
428
- # download excel file if it not downloaded yet
429
- savePath = saveLinkFolder +"/"+ l.split("/")[-1]
430
- excelPath = download_excel_file(l, savePath)
431
- corpus = wc.tableTransformToCorpusText([],excelPath)
432
- text = ''
433
- for c in corpus:
434
- para = corpus[c]
435
- for words in para:
436
- text += " ".join(words)
437
- textsToExtract[l] = text
438
- # delete folder after finishing getting text
439
- #cmd = f'rm -r data/{id}'
440
- #result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
441
- return textsToExtract
442
- # Step 3.2: Extract context
443
- def extract_context(text, keyword, window=500):
444
- # firstly try accession number
445
- idx = text.find(keyword)
446
- if idx == -1:
447
- return "Sample ID not found."
448
- return text[max(0, idx-window): idx+window]
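A sketch of extract_context on a short string — it returns up to `window` characters on each side of the first hit, or a not-found message:
extract_context("isolate BRU18 was collected in Brunei", "BRU18", window=10)
# -> "isolate BRU18 was "
extract_context("no match here", "BRU18")
# -> "Sample ID not found."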
449
- def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
450
- if keep_if is None:
451
- keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]
452
-
453
- outputs = ""
454
- text = text.lower()
455
-
456
- # If isolate is provided, prioritize paragraphs that mention it
457
- # If isolate is provided, prioritize paragraphs that mention it
458
- if accession and accession.lower() in text:
459
- if extract_context(text, accession.lower(), window=700) != "Sample ID not found.":
460
- outputs += extract_context(text, accession.lower(), window=700)
461
- if isolate and isolate.lower() in text:
462
- if extract_context(text, isolate.lower(), window=700) != "Sample ID not found.":
463
- outputs += extract_context(text, isolate.lower(), window=700)
464
- for keyword in keep_if:
465
- para = extract_context(text, keyword)
466
- if para and para not in outputs:
467
- outputs += para + "\n"
468
- return outputs
469
- # Step 4: Classification for now (demo purposes)
470
- # 4.1: Using a HuggingFace model (question-answering)
471
- def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
472
- try:
473
- qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
474
- result = qa({"context": context, "question": question})
475
- return result.get("answer", "Unknown")
476
- except Exception as e:
477
- return f"Error: {str(e)}"
478
-
479
- # 4.2: Infer from haplogroup
480
- # Load pre-trained spaCy model for NER
481
- try:
482
- nlp = spacy.load("en_core_web_sm")
483
- except OSError:
484
- download("en_core_web_sm")
485
- nlp = spacy.load("en_core_web_sm")
486
-
487
- # Define the haplogroup-to-region mapping (simple rule-based)
488
- import csv
489
-
490
- def load_haplogroup_mapping(csv_path):
491
- mapping = {}
492
- with open(csv_path) as f:
493
- reader = csv.DictReader(f)
494
- for row in reader:
495
- mapping[row["haplogroup"]] = [row["region"],row["source"]]
496
- return mapping
497
-
498
- # Function to extract haplogroup from the text
499
- def extract_haplogroup(text):
500
- match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
501
- if match:
502
- submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
503
- if submatch:
504
- return submatch.group(0)
505
- else:
506
- return match.group(1) # fallback
507
- fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
508
- if fallback:
509
- return fallback.group(1)
510
- return None
511
-
512
-
513
- # Function to extract location based on NER
514
- def extract_location(text):
515
- doc = nlp(text)
516
- locations = []
517
- for ent in doc.ents:
518
- if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
519
- locations.append(ent.text)
520
- return locations
521
-
522
- # Function to infer location from haplogroup
523
- def infer_location_from_haplogroup(haplogroup):
524
- haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
525
- return haplo_map.get(haplogroup, ["Unknown","Unknown"])
526
-
527
- # Function to classify the mtDNA sample
528
- def classify_mtDNA_sample_from_haplo(text):
529
- # Extract haplogroup
530
- haplogroup = extract_haplogroup(text)
531
- # Extract location based on NER
532
- locations = extract_location(text)
533
- # Infer location based on haplogroup
534
- inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0],infer_location_from_haplogroup(haplogroup)[1]
535
- return {
536
- "source":sourceHaplo,
537
- "locations_found_in_context": locations,
538
- "haplogroup": haplogroup,
539
- "inferred_location": inferred_location
540
-
541
- }
542
- # 4.3 Get from available NCBI
543
- def infer_location_fromNCBI(accession):
544
- try:
545
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
546
- text = handle.read()
547
- handle.close()
548
- match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
549
- if match:
550
- return match.group(2), match.group(0) # This is the value like "Brunei"
551
- return "Not found", "Not found"
552
-
553
- except Exception as e:
554
- print("❌ Entrez error:", e)
555
- return "Not found", "Not found"
556
-
557
- ### ANCIENT/MODERN FLAG
558
- from Bio import Entrez
559
- import re
560
-
561
- def flag_ancient_modern(accession, textsToExtract, isolate=None):
562
- """
563
- Try to classify a sample as Ancient or Modern using:
564
- 1. NCBI accession (if available)
565
- 2. Supplementary text or context fallback
566
- """
567
- context = ""
568
- label, explain = "", ""
569
-
570
- try:
571
- # Check if we can fetch metadata from NCBI using the accession
572
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
573
- text = handle.read()
574
- handle.close()
575
-
576
- isolate_source = re.search(r'/(isolation_source)\s*=\s*"([^"]+)"', text)
577
- if isolate_source:
578
- context += isolate_source.group(0) + " "
579
-
580
- specimen = re.search(r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"', text)
581
- if specimen:
582
- context += specimen.group(0) + " "
583
-
584
- if context.strip():
585
- label, explain = detect_ancient_flag(context)
586
- if label!="Unknown":
587
- return label, explain + " from NCBI\n(" + context + ")"
588
-
589
- # If no useful NCBI metadata, check supplementary texts
590
- if textsToExtract:
591
- labels = {"modern": [0, ""], "ancient": [0, ""], "unknown": 0}
592
-
593
- for source in textsToExtract:
594
- text_block = textsToExtract[source]
595
- context = extract_relevant_paragraphs(text_block, accession, isolate=isolate) # Reduce to informative paragraph(s)
596
- label, explain = detect_ancient_flag(context)
597
-
598
- if label == "Ancient":
599
- labels["ancient"][0] += 1
600
- labels["ancient"][1] += f"{source}:\n{explain}\n\n"
601
- elif label == "Modern":
602
- labels["modern"][0] += 1
603
- labels["modern"][1] += f"{source}:\n{explain}\n\n"
604
- else:
605
- labels["unknown"] += 1
606
-
607
- if max(labels["modern"][0],labels["ancient"][0]) > 0:
608
- if labels["modern"][0] > labels["ancient"][0]:
609
- return "Modern", labels["modern"][1]
610
- else:
611
- return "Ancient", labels["ancient"][1]
612
- else:
613
- return "Unknown", "No strong keywords detected"
614
- else:
615
- print("No DOI or PubMed ID available for inference.")
616
- return "", ""
617
-
618
- except Exception as e:
619
- print("Error:", e)
620
- return "", ""
621
-
622
-
623
- def detect_ancient_flag(context_snippet):
624
- context = context_snippet.lower()
625
-
626
- ancient_keywords = [
627
- "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
628
- "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
629
- "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
630
- "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
631
- ]
632
-
633
- modern_keywords = [
634
- "modern", "hospital", "clinical", "consent","blood","buccal","unrelated", "blood sample","buccal sample","informed consent", "donor", "healthy", "patient",
635
- "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
636
- "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
637
- "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
638
- "bioinformatic analysis", "samples from", "population genetics", "genome-wide data", "imr collection"
639
- ]
640
-
641
- ancient_hits = [k for k in ancient_keywords if k in context]
642
- modern_hits = [k for k in modern_keywords if k in context]
643
-
644
- if ancient_hits and not modern_hits:
645
- return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
646
- elif modern_hits and not ancient_hits:
647
- return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
648
- elif ancient_hits and modern_hits:
649
- if len(ancient_hits) >= len(modern_hits):
650
- return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
651
- else:
652
- return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"
653
-
654
- # Fallback to QA
655
- answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
656
- if answer.startswith("Error"):
657
- return "Unknown", answer
658
- if "ancient" in answer.lower():
659
- return "Ancient", f"Leaning ancient based on QA: {answer}"
660
- elif "modern" in answer.lower():
661
- return "Modern", f"Leaning modern based on QA: {answer}"
662
- else:
663
- return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"
664
-
665
- # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
666
- def classify_sample_location(accession):
667
- outputs = {}
668
- keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
669
- # Step 1: get pubmed id and isolate
670
- pubmedID, isolate = get_info_from_accession(accession)
671
- '''if not pubmedID:
672
- return {"error": f"Could not retrieve PubMed ID for accession {accession}"}'''
673
- if not isolate:
674
- isolate = "UNKNOWN_ISOLATE"
675
- # Step 2: get doi
676
- doi = get_doi_from_pubmed_id(pubmedID)
677
- '''if not doi:
678
- return {"error": "DOI not found for this accession. Cannot fetch paper or context."}'''
679
- # Step 3: get text
680
- '''textsToExtract = { "doiLink":"paperText"
681
- "file1.pdf":"text1",
682
- "file2.doc":"text2",
683
- "file3.xlsx":excelText3'''
684
- if doi and pubmedID:
685
- textsToExtract = get_paper_text(doi,pubmedID)
686
- else: textsToExtract = {}
687
- '''if not textsToExtract:
688
- return {"error": f"No texts extracted for DOI {doi}"}'''
689
- if isolate not in [None, "UNKNOWN_ISOLATE"]:
690
- label, explain = flag_ancient_modern(accession,textsToExtract,isolate)
691
- else:
692
- label, explain = flag_ancient_modern(accession,textsToExtract)
693
- # Step 4: prediction
694
- outputs[accession] = {}
695
- outputs[isolate] = {}
696
- # 4.0 Infer from NCBI
697
- location, outputNCBI = infer_location_fromNCBI(accession)
698
- NCBI_result = {
699
- "source": "NCBI",
700
- "sample_id": accession,
701
- "predicted_location": location,
702
- "context_snippet": outputNCBI}
703
- outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
704
- if textsToExtract:
705
- long_text = ""
706
- for key in textsToExtract:
707
- text = textsToExtract[key]
708
- # try accession number first
709
- outputs[accession][key] = {}
710
- keyword = accession
711
- context = extract_context(text, keyword, window=500)
712
- # 4.1: Using a HuggingFace model (question-answering)
713
- location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
714
- qa_result = {
715
- "source": key,
716
- "sample_id": keyword,
717
- "predicted_location": location,
718
- "context_snippet": context
719
- }
720
- outputs[keyword][key]["QAModel"] = qa_result
721
- # 4.2: Infer from haplogroup
722
- haplo_result = classify_mtDNA_sample_from_haplo(context)
723
- outputs[keyword][key]["haplogroup"] = haplo_result
724
- # try isolate
725
- keyword = isolate
726
- outputs[isolate][key] = {}
727
- context = extract_context(text, keyword, window=500)
728
- # 4.1.1: Using a HuggingFace model (question-answering)
729
- location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
730
- qa_result = {
731
- "source": key,
732
- "sample_id": keyword,
733
- "predicted_location": location,
734
- "context_snippet": context
735
- }
736
- outputs[keyword][key]["QAModel"] = qa_result
737
- # 4.2.1: Infer from haplogroup
738
- haplo_result = classify_mtDNA_sample_from_haplo(context)
739
- outputs[keyword][key]["haplogroup"] = haplo_result
740
- # add long text
741
- long_text += text + ". \n"
742
- # 4.3: UpgradeClassify
743
- # try sample_id as accession number
744
- sample_id = accession
745
- if sample_id:
746
- filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
747
- locations = infer_location_for_sample(sample_id.upper(), filtered_context)
748
- if locations!="No clear location found in top matches":
749
- outputs[sample_id]["upgradeClassifier"] = {}
750
- outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
751
- "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
752
- "sample_id": sample_id,
753
- "predicted_location": ", ".join(locations),
754
- "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
755
- }
756
- # try sample_id as isolate name
757
- sample_id = isolate
758
- if sample_id:
759
- filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
760
- locations = infer_location_for_sample(sample_id.upper(), filtered_context)
761
- if locations!="No clear location found in top matches":
762
- outputs[sample_id]["upgradeClassifier"] = {}
763
- outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
764
- "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
765
- "sample_id": sample_id,
766
- "predicted_location": ", ".join(locations),
767
- "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
768
- }
769
  return outputs, label, explain
 
1
+ # mtDNA Location Classifier MVP (Google Colab)
2
+ # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
+ import os
4
+ #import streamlit as st
5
+ import subprocess
6
+ import re
7
+ from Bio import Entrez
8
+ import fitz
9
+ import spacy
10
+ from spacy.cli import download
11
+ from NER.PDF import pdf
12
+ from NER.WordDoc import wordDoc
13
+ from NER.html import extractHTML
14
+ from NER.word2Vec import word2vec
15
+ from transformers import pipeline
16
+ import urllib.parse, requests
17
+ from pathlib import Path
18
+ from upgradeClassify import filter_context_for_sample, infer_location_for_sample
19
+ import model
20
+ # Set your email (required by NCBI Entrez)
21
+ #Entrez.email = "your-email@example.com"
22
+ import nltk
23
+
24
+ nltk.download("stopwords")
25
+ nltk.download("punkt")
26
+ nltk.download('punkt_tab')
27
+ # Step 1: Get PubMed ID from Accession using EDirect
28
+ from Bio import Entrez, Medline
29
+ import re
30
+
31
+ Entrez.email = "your_email@example.com"
32
+
33
+ # --- Helper Functions (Re-organized and Upgraded) ---
34
+
35
+ def fetch_ncbi_metadata(accession_number):
36
+ """
37
+ Fetches metadata directly from NCBI GenBank using Entrez.
38
+ Includes robust error handling and improved field extraction.
39
+ Prioritizes location extraction from geo_loc_name, then notes, then other qualifiers.
40
+ Also attempts to extract ethnicity and sample_type (ancient/modern).
41
+
42
+ Args:
43
+ accession_number (str): The NCBI accession number (e.g., "ON792208").
44
+
45
+ Returns:
46
+ dict: A dictionary containing 'country', 'specific_location', 'ethnicity',
47
+ 'sample_type', 'collection_date', 'isolate', 'title', 'doi', 'pubmed_id'.
48
+ """
49
+ Entrez.email = "your.email@example.com" # Required by NCBI, REPLACE WITH YOUR EMAIL
50
+
51
+ country = "unknown"
52
+ specific_location = "unknown"
53
+ ethnicity = "unknown"
54
+ sample_type = "unknown"
55
+ collection_date = "unknown"
56
+ isolate = "unknown"
57
+ title = "unknown"
58
+ doi = "unknown"
59
+ pubmed_id = None
60
+ all_feature = "unknown"
61
+
62
+ KNOWN_COUNTRIES = [
63
+ "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan",
64
+ "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi",
65
+ "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo (Brazzaville)", "Congo (Kinshasa)", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czechia",
66
+ "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia",
67
+ "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana",
68
+ "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan",
69
+ "Kazakhstan", "Kenya", "Kiribati", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg",
70
+ "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar",
71
+ "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Korea", "North Macedonia", "Norway", "Oman",
72
+ "Pakistan", "Palau", "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda",
73
+ "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden", "Switzerland", "Syria",
74
+ "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu",
75
+ "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam",
76
+ "Yemen", "Zambia", "Zimbabwe"
77
+ ]
78
+ COUNTRY_PATTERN = re.compile(r'\b(' + '|'.join(re.escape(c) for c in KNOWN_COUNTRIES) + r')\b', re.IGNORECASE)
79
+
80
+ try:
81
+ handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
82
+ record = Entrez.read(handle)
83
+ handle.close()
84
+
85
+ gb_seq = None
86
+ # Validate record structure: It should be a list with at least one element (a dict)
87
+ if isinstance(record, list) and len(record) > 0:
88
+ if isinstance(record[0], dict):
89
+ gb_seq = record[0]
90
+ else:
91
+ print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
92
+ else:
93
+ print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
94
+
95
+ # If gb_seq is still None, return defaults
96
+ if gb_seq is None:
97
+ return {"country": "unknown",
98
+ "specific_location": "unknown",
99
+ "ethnicity": "unknown",
100
+ "sample_type": "unknown",
101
+ "collection_date": "unknown",
102
+ "isolate": "unknown",
103
+ "title": "unknown",
104
+ "doi": "unknown",
105
+ "pubmed_id": None,
106
+ "all_features": "unknown"}
107
+
108
+
109
+ # If gb_seq is valid, proceed with extraction
110
+ collection_date = gb_seq.get("GBSeq_create-date","unknown")
111
+
112
+ references = gb_seq.get("GBSeq_references", [])
113
+ for ref in references:
114
+ if not pubmed_id:
115
+ pubmed_id = ref.get("GBReference_pubmed",None)
116
+ if title == "unknown":
117
+ title = ref.get("GBReference_title","unknown")
118
+ for xref in ref.get("GBReference_xref", []):
119
+ if xref.get("GBXref_dbname") == "doi":
120
+ doi = xref.get("GBXref_id")
121
+ break
122
+
123
+ features = gb_seq.get("GBSeq_feature-table", [])
124
+
125
+ context_for_flagging = "" # Accumulate text for ancient/modern detection
126
+ features_context = ""
127
+ for feature in features:
128
+ if feature.get("GBFeature_key") == "source":
129
+ feature_context = ""
130
+ qualifiers = feature.get("GBFeature_quals", [])
131
+ found_country = "unknown"
132
+ found_specific_location = "unknown"
133
+ found_ethnicity = "unknown"
134
+
135
+ temp_geo_loc_name = "unknown"
136
+ temp_note_origin_locality = "unknown"
137
+ temp_country_qual = "unknown"
138
+ temp_locality_qual = "unknown"
139
+ temp_collection_location_qual = "unknown"
140
+ temp_isolation_source_qual = "unknown"
141
+ temp_env_sample_qual = "unknown"
142
+ temp_pop_qual = "unknown"
143
+ temp_organism_qual = "unknown"
144
+ temp_specimen_qual = "unknown"
145
+ temp_strain_qual = "unknown"
146
+
147
+ for qual in qualifiers:
148
+ qual_name = qual.get("GBQualifier_name")
149
+ qual_value = qual.get("GBQualifier_value")
150
+ feature_context += qual_name + ": " + qual_value +"\n"
151
+ if qual_name == "collection_date":
152
+ collection_date = qual_value
153
+ elif qual_name == "isolate":
154
+ isolate = qual_value
155
+ elif qual_name == "population":
156
+ temp_pop_qual = qual_value
157
+ elif qual_name == "organism":
158
+ temp_organism_qual = qual_value
159
+ elif qual_name == "specimen_voucher" or qual_name == "specimen":
160
+ temp_specimen_qual = qual_value
161
+ elif qual_name == "strain":
162
+ temp_strain_qual = qual_value
163
+ elif qual_name == "isolation_source":
164
+ temp_isolation_source_qual = qual_value
165
+ elif qual_name == "environmental_sample":
166
+ temp_env_sample_qual = qual_value
167
+
168
+ if qual_name == "geo_loc_name": temp_geo_loc_name = qual_value
169
+ elif qual_name == "note":
170
+ if qual_value.startswith("origin_locality:"):
171
+ temp_note_origin_locality = qual_value
172
+ context_for_flagging += qual_value + " " # Capture all notes for flagging
173
+ elif qual_name == "country": temp_country_qual = qual_value
174
+ elif qual_name == "locality": temp_locality_qual = qual_value
175
+ elif qual_name == "collection_location": temp_collection_location_qual = qual_value
176
+
177
+
178
+ # --- Aggregate all relevant info into context_for_flagging ---
179
+ context_for_flagging += f" {isolate} {temp_isolation_source_qual} {temp_specimen_qual} {temp_strain_qual} {temp_organism_qual} {temp_geo_loc_name} {temp_collection_location_qual} {temp_env_sample_qual}"
180
+ context_for_flagging = context_for_flagging.strip()
181
+
182
+ # --- Determine final country and specific_location based on priority ---
183
+ if temp_geo_loc_name != "unknown":
184
+ parts = [p.strip() for p in temp_geo_loc_name.split(':')]
185
+ if len(parts) > 1:
186
+ found_specific_location = parts[-1]; found_country = parts[0]
187
+ else: found_country = temp_geo_loc_name; found_specific_location = "unknown"
188
+ elif temp_note_origin_locality != "unknown":
189
+ match = re.search(r"origin_locality:\s*(.*)", temp_note_origin_locality, re.IGNORECASE)
190
+ if match:
191
+ location_string = match.group(1).strip()
192
+ parts = [p.strip() for p in location_string.split(':')]
193
+ if len(parts) > 1:
194
+ #found_country = parts[-1]; found_specific_location = parts[0]
195
+ found_country = model.get_country_from_text(temp_note_origin_locality.lower())
196
+ if found_country == "unknown":
197
+ found_country = parts[0];
198
+ found_specific_location = parts[-1]
199
+ else: found_country = location_string; found_specific_location = "unknown"
200
+ elif temp_locality_qual != "unknown":
201
+ found_country_match = COUNTRY_PATTERN.search(temp_locality_qual)
202
+ if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_locality_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
203
+ else: found_specific_location = temp_locality_qual; found_country = "unknown"
204
+ elif temp_collection_location_qual != "unknown":
205
+ found_country_match = COUNTRY_PATTERN.search(temp_collection_location_qual)
206
+ if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_collection_location_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
207
+ else: found_specific_location = temp_collection_location_qual; found_country = "unknown"
208
+ elif temp_isolation_source_qual != "unknown":
209
+ found_country_match = COUNTRY_PATTERN.search(temp_isolation_source_qual)
210
+ if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_isolation_source_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
211
+ else: found_specific_location = temp_isolation_source_qual; found_country = "unknown"
212
+ elif temp_env_sample_qual != "unknown":
213
+ found_country_match = COUNTRY_PATTERN.search(temp_env_sample_qual)
214
+ if found_country_match: found_country = found_country_match.group(1); temp_loc = re.sub(re.escape(found_country), '', temp_env_sample_qual, flags=re.IGNORECASE).strip().replace(',', '').replace(':', '').replace(';', '').strip(); found_specific_location = temp_loc if temp_loc else "unknown"
215
+ else: found_specific_location = temp_env_sample_qual; found_country = "unknown"
216
+ if found_country == "unknown" and temp_country_qual != "unknown":
217
+ found_country_match = COUNTRY_PATTERN.search(temp_country_qual)
218
+ if found_country_match: found_country = found_country_match.group(1)
219
+
220
+ country = found_country
221
+ specific_location = found_specific_location
222
+ # --- Determine final ethnicity ---
223
+ if temp_pop_qual != "unknown":
224
+ found_ethnicity = temp_pop_qual
225
+ elif isolate != "unknown" and re.fullmatch(r'[A-Za-z\s\-]+', isolate) and get_country_from_text(isolate) == "unknown":
226
+ found_ethnicity = isolate
227
+ elif context_for_flagging != "unknown": # Use the broader context for ethnicity patterns
228
+ eth_match = re.search(r'(?:population|ethnicity|isolate source):\s*([A-Za-z\s\-]+)', context_for_flagging, re.IGNORECASE)
229
+ if eth_match:
230
+ found_ethnicity = eth_match.group(1).strip()
231
+
232
+ ethnicity = found_ethnicity
233
+
234
+ # --- Determine sample_type (ancient/modern) ---
235
+ if context_for_flagging:
236
+ sample_type, explain = detect_ancient_flag(context_for_flagging)
237
+ features_context += feature_context + "\n"
238
+ break
239
+
240
+ if specific_location != "unknown" and specific_location.lower() == country.lower():
241
+ specific_location = "unknown"
242
+ if not features_context: features_context = "unknown"
243
+ return {"country": country.lower(),
244
+ "specific_location": specific_location.lower(),
245
+ "ethnicity": ethnicity.lower(),
246
+ "sample_type": sample_type.lower(),
247
+ "collection_date": collection_date,
248
+ "isolate": isolate,
249
+ "title": title,
250
+ "doi": doi,
251
+ "pubmed_id": pubmed_id,
252
+ "all_features": features_context}
253
+
254
+ except Exception as e:
256
+ print(f"Error fetching NCBI data for {accession_number}: {e}")
256
+ return {"country": "unknown",
257
+ "specific_location": "unknown",
258
+ "ethnicity": "unknown",
259
+ "sample_type": "unknown",
260
+ "collection_date": "unknown",
261
+ "isolate": "unknown",
262
+ "title": "unknown",
263
+ "doi": "unknown",
264
+ "pubmed_id": None,
265
+ "all_features": "unknown"}
266
+
267
+ # --- Helper function for country matching (re-defined from main code to be self-contained) ---
268
+ _country_keywords = {
269
+ "thailand": "Thailand", "laos": "Laos", "cambodia": "Cambodia", "myanmar": "Myanmar",
270
+ "philippines": "Philippines", "indonesia": "Indonesia", "malaysia": "Malaysia",
271
+ "china": "China", "chinese": "China", "india": "India", "taiwan": "Taiwan",
272
+ "vietnam": "Vietnam", "russia": "Russia", "siberia": "Russia", "nepal": "Nepal",
273
+ "japan": "Japan", "sumatra": "Indonesia", "borneu": "Indonesia",
274
+ "yunnan": "China", "tibet": "China", "northern mindanao": "Philippines",
275
+ "west malaysia": "Malaysia", "north thailand": "Thailand", "central thailand": "Thailand",
276
+ "northeast thailand": "Thailand", "east myanmar": "Myanmar", "west thailand": "Thailand",
277
+ "central india": "India", "east india": "India", "northeast india": "India",
278
+ "south sibera": "Russia", "mongolia": "China", "beijing": "China", "south korea": "South Korea",
279
+ "north asia": "unknown", "southeast asia": "unknown", "east asia": "unknown"
280
+ }
281
+
282
+ def get_country_from_text(text):
283
+ text_lower = text.lower()
284
+ for keyword, country in _country_keywords.items():
285
+ if keyword in text_lower:
286
+ return country
287
+ return "unknown"
288
+ # The result will be seen as manualLink for the function get_paper_text
289
+ # def search_google_custom(query, max_results=3):
290
+ # # query should be the title from ncbi or paper/source title
291
+ # GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY"]
292
+ # GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"]
293
+ # endpoint = os.environ["SEARCH_ENDPOINT"]
294
+ # params = {
295
+ # "key": GOOGLE_CSE_API_KEY,
296
+ # "cx": GOOGLE_CSE_CX,
297
+ # "q": query,
298
+ # "num": max_results
299
+ # }
300
+ # try:
301
+ # response = requests.get(endpoint, params=params)
302
+ # if response.status_code == 429:
303
+ # print("Rate limit hit. Try again later.")
304
+ # return []
305
+ # response.raise_for_status()
306
+ # data = response.json().get("items", [])
307
+ # return [item.get("link") for item in data if item.get("link")]
308
+ # except Exception as e:
309
+ # print("Google CSE error:", e)
310
+ # return []
311
+
312
+ def search_google_custom(query, max_results=3):
313
+ # query should be the title from ncbi or paper/source title
314
+ GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY"]
315
+ GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"]
316
+ endpoint = os.environ["SEARCH_ENDPOINT"]
317
+ params = {
318
+ "key": GOOGLE_CSE_API_KEY,
319
+ "cx": GOOGLE_CSE_CX,
320
+ "q": query,
321
+ "num": max_results
322
+ }
323
+ try:
324
+ response = requests.get(endpoint, params=params)
325
+ if response.status_code == 429:
326
+ print("Rate limit hit. Try again later.")
327
+ print("try with back up account")
328
+ try:
329
+ return search_google_custom_backup(query, max_results)
330
+ except:
331
+ return []
332
+ response.raise_for_status()
333
+ data = response.json().get("items", [])
334
+ return [item.get("link") for item in data if item.get("link")]
335
+ except Exception as e:
336
+ print("Google CSE error:", e)
337
+ return []
338
+
339
+ def search_google_custom_backup(query, max_results=3):
340
+ # query should be the title from ncbi or paper/source title
341
+ GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY_BACKUP"]
342
+ GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX_BACKUP"]
343
+ endpoint = os.environ["SEARCH_ENDPOINT"]
344
+ params = {
345
+ "key": GOOGLE_CSE_API_KEY,
346
+ "cx": GOOGLE_CSE_CX,
347
+ "q": query,
348
+ "num": max_results
349
+ }
350
+ try:
351
+ response = requests.get(endpoint, params=params)
352
+ if response.status_code == 429:
353
+ print("Rate limit hit. Try again later.")
354
+ return []
355
+ response.raise_for_status()
356
+ data = response.json().get("items", [])
357
+ return [item.get("link") for item in data if item.get("link")]
358
+ except Exception as e:
359
+ print("Google CSE error:", e)
360
+ return []
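A hedged usage sketch of the CSE wrappers above; it assumes the GOOGLE_CSE_API_KEY, GOOGLE_CSE_CX and SEARCH_ENDPOINT environment variables are set, and that the backup credentials are only consulted after a 429:

```python
import os
from mtdna_classifier import search_google_custom

# The query is typically the GenBank reference title returned by fetch_ncbi_metadata.
assert all(k in os.environ for k in ("GOOGLE_CSE_API_KEY", "GOOGLE_CSE_CX", "SEARCH_ENDPOINT"))
links = search_google_custom("complete mitochondrial genome Homo sapiens isolate", max_results=3)
print(links)  # a list of result URLs, or [] on errors and rate limiting
```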
361
+ # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
362
+ # Step 3.1: Extract Text
363
+ # sub: download excel file
364
+ def download_excel_file(url, save_path="temp.xlsx"):
365
+ if "view.officeapps.live.com" in url:
366
+ parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
367
+ real_url = urllib.parse.unquote(parsed_url["src"][0])
368
+ response = requests.get(real_url)
369
+ with open(save_path, "wb") as f:
370
+ f.write(response.content)
371
+ return save_path
372
+ elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
373
+ response = requests.get(url)
374
+ response.raise_for_status() # Raises error if download fails
375
+ with open(save_path, "wb") as f:
376
+ f.write(response.content)
377
+ return save_path
378
+ else:
379
+ print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
380
+ return url
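A short sketch of the two URL shapes handled above; the URLs are placeholders, not real files:

```python
from mtdna_classifier import download_excel_file

# 1) A direct .xls/.xlsx link is downloaded and written to save_path.
path = download_excel_file("https://example.org/supplement/tableS1.xlsx", "data/tableS1.xlsx")

# 2) An Office web-viewer link is unwrapped via its ?src= parameter before downloading.
viewer = "https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fexample.org%2FtableS1.xlsx"
path = download_excel_file(viewer, "data/tableS1.xlsx")

# Anything else (for example a path that is already local) is returned unchanged.
```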
381
+ def get_paper_text(doi,id,manualLinks=None):
382
+ # create the temporary folder to contain the texts
383
+ folder_path = Path("data/"+str(id))
384
+ if not folder_path.exists():
385
+ cmd = f'mkdir data/{id}'
386
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
387
+ print("data/"+str(id) +" created.")
388
+ else:
389
+ print("data/"+str(id) +" already exists.")
390
+ saveLinkFolder = "data/"+id
391
+
392
+ link = 'https://doi.org/' + doi
393
+ '''textsToExtract = { "doiLink": "paperText",
394
+ "file1.pdf": "text1",
395
+ "file2.doc": "text2",
396
+ "file3.xlsx": "excelText3" }'''
397
+ textsToExtract = {}
398
+ # get the file to create listOfFile for each id
399
+ html = extractHTML.HTML("",link)
400
+ jsonSM = html.getSupMaterial()
401
+ text = ""
402
+ links = [link] + sum((jsonSM[key] for key in jsonSM),[])
403
+ if manualLinks != None:
404
+ links += manualLinks
405
+ for l in links:
406
+ # get the main paper
407
+ name = l.split("/")[-1]
408
+ file_path = folder_path / name
409
+ if l == link:
410
+ text = html.getListSection()
411
+ textsToExtract[link] = text
412
+ elif l.endswith(".pdf"):
413
+ if file_path.is_file():
414
+ l = saveLinkFolder + "/" + name
415
+ print("File exists.")
416
+ p = pdf.PDF(l,saveLinkFolder,doi)
417
+ f = p.openPDFFile()
418
+ pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
419
+ doc = fitz.open(pdf_path)
420
+ text = "\n".join([page.get_text() for page in doc])
421
+ textsToExtract[l] = text
422
+ elif l.endswith(".doc") or l.endswith(".docx"):
423
+ d = wordDoc.wordDoc(l,saveLinkFolder)
424
+ text = d.extractTextByPage()
425
+ textsToExtract[l] = text
426
+ elif l.split(".")[-1].lower() in "xlsx":
427
+ wc = word2vec.word2Vec()
428
+ # download excel file if it not downloaded yet
429
+ savePath = saveLinkFolder +"/"+ l.split("/")[-1]
430
+ excelPath = download_excel_file(l, savePath)
431
+ corpus = wc.tableTransformToCorpusText([],excelPath)
432
+ text = ''
433
+ for c in corpus:
434
+ para = corpus[c]
435
+ for words in para:
436
+ text += " ".join(words)
437
+ textsToExtract[l] = text
438
+ # delete folder after finishing getting text
439
+ #cmd = f'rm -r data/{id}'
440
+ #result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
441
+ return textsToExtract
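The return value keeps one text blob per source, keyed by the DOI landing page or the local file path; a sketch with an illustrative DOI and PubMed ID:

```python
from mtdna_classifier import get_paper_text

texts = get_paper_text("10.1000/exampledoi", "12345678")   # doi, pubmed id (illustrative)
for source, text in texts.items():
    print(source, len(text))
# e.g. {"https://doi.org/10.1000/exampledoi": "<main paper text>",
#       "data/12345678/supplement.pdf": "<supplement text>", ...}
```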
442
+ # Step 3.2: Extract context
443
+ def extract_context(text, keyword, window=500):
444
+ # firstly try accession number
445
+ idx = text.find(keyword)
446
+ if idx == -1:
447
+ return "Sample ID not found."
448
+ return text[max(0, idx-window): idx+window]
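A worked example of the window logic: the slice runs from window characters before the first match to window characters after the match position, so only the start of the keyword is guaranteed to fall inside it (the accession is illustrative):

```python
from mtdna_classifier import extract_context

text = "The sample KX123456 was collected in Brunei."
print(extract_context(text, "KX123456", window=10))   # -> "he sample KX123456 w"
print(extract_context(text, "ZZ999999", window=10))   # -> "Sample ID not found."
```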
449
+ def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
450
+ if keep_if is None:
451
+ keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]
452
+
453
+ outputs = ""
454
+ text = text.lower()
455
+
456
+ # If the accession is provided, prioritize paragraphs that mention it
457
+ # (the same is done for the isolate just below)
458
+ if accession and accession.lower() in text:
459
+ if extract_context(text, accession.lower(), window=700) != "Sample ID not found.":
460
+ outputs += extract_context(text, accession.lower(), window=700)
461
+ if isolate and isolate.lower() in text:
462
+ if extract_context(text, isolate.lower(), window=700) != "Sample ID not found.":
463
+ outputs += extract_context(text, isolate.lower(), window=700)
464
+ for keyword in keep_if:
465
+ para = extract_context(text, keyword)
466
+ if para and para not in outputs:
467
+ outputs += para + "\n"
468
+ return outputs
469
+ # Step 4: Classification for now (demo purposes)
470
+ # 4.1: Using a HuggingFace model (question-answering)
471
+ def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
472
+ try:
473
+ qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
474
+ result = qa({"context": context, "question": question})
475
+ return result.get("answer", "Unknown")
476
+ except Exception as e:
477
+ return f"Error: {str(e)}"
478
+
479
+ # 4.2: Infer from haplogroup
480
+ # Load pre-trained spaCy model for NER
481
+ try:
482
+ nlp = spacy.load("en_core_web_sm")
483
+ except OSError:
484
+ download("en_core_web_sm")
485
+ nlp = spacy.load("en_core_web_sm")
486
+
487
+ # Define the haplogroup-to-region mapping (simple rule-based)
488
+ import csv
489
+
490
+ def load_haplogroup_mapping(csv_path):
491
+ mapping = {}
492
+ with open(csv_path) as f:
493
+ reader = csv.DictReader(f)
494
+ for row in reader:
495
+ mapping[row["haplogroup"]] = [row["region"],row["source"]]
496
+ return mapping
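load_haplogroup_mapping assumes a CSV with at least haplogroup, region and source columns; a sketch of the expected layout (rows illustrative):

```python
from mtdna_classifier import load_haplogroup_mapping

# data/haplogroup_regions_extended.csv is assumed to look like:
#   haplogroup,region,source
#   B4a1a,Island Southeast Asia,<citation or URL>
#   U5b,Europe,<citation or URL>
mapping = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
print(mapping.get("B4a1a", ["Unknown", "Unknown"]))  # -> [region, source]
```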
497
+
498
+ # Function to extract haplogroup from the text
499
+ def extract_haplogroup(text):
500
+ match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
501
+ if match:
502
+ submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
503
+ if submatch:
504
+ return submatch.group(0)
505
+ else:
506
+ return match.group(1) # fallback
507
+ fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
508
+ if fallback:
509
+ return fallback.group(1)
510
+ return None
511
+
512
+
513
+ # Function to extract location based on NER
514
+ def extract_location(text):
515
+ doc = nlp(text)
516
+ locations = []
517
+ for ent in doc.ents:
518
+ if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
519
+ locations.append(ent.text)
520
+ return locations
521
+
522
+ # Function to infer location from haplogroup
523
+ def infer_location_from_haplogroup(haplogroup):
524
+ haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
525
+ return haplo_map.get(haplogroup, ["Unknown","Unknown"])
526
+
527
+ # Function to classify the mtDNA sample
528
+ def classify_mtDNA_sample_from_haplo(text):
529
+ # Extract haplogroup
530
+ haplogroup = extract_haplogroup(text)
531
+ # Extract location based on NER
532
+ locations = extract_location(text)
533
+ # Infer location based on haplogroup
534
+ inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0],infer_location_from_haplogroup(haplogroup)[1]
535
+ return {
536
+ "source":sourceHaplo,
537
+ "locations_found_in_context": locations,
538
+ "haplogroup": haplogroup,
539
+ "inferred_location": inferred_location
540
+
541
+ }
542
+ # 4.3 Get from available NCBI
543
+ def infer_location_fromNCBI(accession):
544
+ try:
545
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
546
+ text = handle.read()
547
+ handle.close()
548
+ match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
549
+ if match:
550
+ return match.group(2), match.group(0) # This is the value like "Brunei"
551
+ return "Not found", "Not found"
552
+
553
+ except Exception as e:
554
+ print("❌ Entrez error:", e)
555
+ return "Not found", "Not found"
556
+
557
+ ### ANCIENT/MODERN FLAG
558
+ from Bio import Entrez
559
+ import re
560
+
561
+ def flag_ancient_modern(accession, textsToExtract, isolate=None):
562
+ """
563
+ Try to classify a sample as Ancient or Modern using:
564
+ 1. NCBI accession (if available)
565
+ 2. Supplementary text or context fallback
566
+ """
567
+ context = ""
568
+ label, explain = "", ""
569
+
570
+ try:
571
+ # Check if we can fetch metadata from NCBI using the accession
572
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
573
+ text = handle.read()
574
+ handle.close()
575
+
576
+ isolate_source = re.search(r'/(isolation_source)\s*=\s*"([^"]+)"', text)
577
+ if isolate_source:
578
+ context += isolate_source.group(0) + " "
579
+
580
+ specimen = re.search(r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"', text)
581
+ if specimen:
582
+ context += specimen.group(0) + " "
583
+
584
+ if context.strip():
585
+ label, explain = detect_ancient_flag(context)
586
+ if label!="Unknown":
587
+ return label, explain + " from NCBI\n(" + context + ")"
588
+
589
+ # If no useful NCBI metadata, check supplementary texts
590
+ if textsToExtract:
591
+ labels = {"modern": [0, ""], "ancient": [0, ""], "unknown": 0}
592
+
593
+ for source in textsToExtract:
594
+ text_block = textsToExtract[source]
595
+ context = extract_relevant_paragraphs(text_block, accession, isolate=isolate) # Reduce to informative paragraph(s)
596
+ label, explain = detect_ancient_flag(context)
597
+
598
+ if label == "Ancient":
599
+ labels["ancient"][0] += 1
600
+ labels["ancient"][1] += f"{source}:\n{explain}\n\n"
601
+ elif label == "Modern":
602
+ labels["modern"][0] += 1
603
+ labels["modern"][1] += f"{source}:\n{explain}\n\n"
604
+ else:
605
+ labels["unknown"] += 1
606
+
607
+ if max(labels["modern"][0],labels["ancient"][0]) > 0:
608
+ if labels["modern"][0] > labels["ancient"][0]:
609
+ return "Modern", labels["modern"][1]
610
+ else:
611
+ return "Ancient", labels["ancient"][1]
612
+ else:
613
+ return "Unknown", "No strong keywords detected"
614
+ else:
615
+ print("No DOI or PubMed ID available for inference.")
616
+ return "", ""
617
+
618
+ except Exception as e:
619
+ print("Error:", e)
620
+ return "", ""
621
+
622
+
623
+ def detect_ancient_flag(context_snippet):
624
+ context = context_snippet.lower()
625
+
626
+ ancient_keywords = [
627
+ "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
628
+ "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
629
+ "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
630
+ "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
631
+ ]
632
+
633
+ modern_keywords = [
634
+ "modern", "hospital", "clinical", "consent","blood","buccal","unrelated", "blood sample","buccal sample","informed consent", "donor", "healthy", "patient",
635
+ "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
636
+ "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
637
+ "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
638
+ "bioinformatic analysis", "samples from", "population genetics", "genome-wide data", "imr collection"
639
+ ]
640
+
641
+ ancient_hits = [k for k in ancient_keywords if k in context]
642
+ modern_hits = [k for k in modern_keywords if k in context]
643
+
644
+ if ancient_hits and not modern_hits:
645
+ return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
646
+ elif modern_hits and not ancient_hits:
647
+ return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
648
+ elif ancient_hits and modern_hits:
649
+ if len(ancient_hits) >= len(modern_hits):
650
+ return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
651
+ else:
652
+ return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"
653
+
654
+ # Fallback to QA
655
+ answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
656
+ if answer.startswith("Error"):
657
+ return "Unknown", answer
658
+ if "ancient" in answer.lower():
659
+ return "Ancient", f"Leaning ancient based on QA: {answer}"
660
+ elif "modern" in answer.lower():
661
+ return "Modern", f"Leaning modern based on QA: {answer}"
662
+ else:
663
+ return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"
664
+
665
+ # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
666
+ def classify_sample_location(accession):
667
+ outputs = {}
668
+ keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
669
+ # Step 1: get pubmed id and isolate
670
+ pubmedID, isolate = get_info_from_accession(accession)
671
+ '''if not pubmedID:
672
+ return {"error": f"Could not retrieve PubMed ID for accession {accession}"}'''
673
+ if not isolate:
674
+ isolate = "UNKNOWN_ISOLATE"
675
+ # Step 2: get doi
676
+ doi = get_doi_from_pubmed_id(pubmedID)
677
+ '''if not doi:
678
+ return {"error": "DOI not found for this accession. Cannot fetch paper or context."}'''
679
+ # Step 3: get text
680
+ '''textsToExtract = { "doiLink": "paperText",
681
+ "file1.pdf": "text1",
682
+ "file2.doc": "text2",
683
+ "file3.xlsx": "excelText3" }'''
684
+ if doi and pubmedID:
685
+ textsToExtract = get_paper_text(doi,pubmedID)
686
+ else: textsToExtract = {}
687
+ '''if not textsToExtract:
688
+ return {"error": f"No texts extracted for DOI {doi}"}'''
689
+ if isolate not in [None, "UNKNOWN_ISOLATE"]:
690
+ label, explain = flag_ancient_modern(accession,textsToExtract,isolate)
691
+ else:
692
+ label, explain = flag_ancient_modern(accession,textsToExtract)
693
+ # Step 4: prediction
694
+ outputs[accession] = {}
695
+ outputs[isolate] = {}
696
+ # 4.0 Infer from NCBI
697
+ location, outputNCBI = infer_location_fromNCBI(accession)
698
+ NCBI_result = {
699
+ "source": "NCBI",
700
+ "sample_id": accession,
701
+ "predicted_location": location,
702
+ "context_snippet": outputNCBI}
703
+ outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
704
+ if textsToExtract:
705
+ long_text = ""
706
+ for key in textsToExtract:
707
+ text = textsToExtract[key]
708
+ # try accession number first
709
+ outputs[accession][key] = {}
710
+ keyword = accession
711
+ context = extract_context(text, keyword, window=500)
712
+ # 4.1: Using a HuggingFace model (question-answering)
713
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
714
+ qa_result = {
715
+ "source": key,
716
+ "sample_id": keyword,
717
+ "predicted_location": location,
718
+ "context_snippet": context
719
+ }
720
+ outputs[keyword][key]["QAModel"] = qa_result
721
+ # 4.2: Infer from haplogroup
722
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
723
+ outputs[keyword][key]["haplogroup"] = haplo_result
724
+ # try isolate
725
+ keyword = isolate
726
+ outputs[isolate][key] = {}
727
+ context = extract_context(text, keyword, window=500)
728
+ # 4.1.1: Using a HuggingFace model (question-answering)
729
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
730
+ qa_result = {
731
+ "source": key,
732
+ "sample_id": keyword,
733
+ "predicted_location": location,
734
+ "context_snippet": context
735
+ }
736
+ outputs[keyword][key]["QAModel"] = qa_result
737
+ # 4.2.1: Infer from haplogroup
738
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
739
+ outputs[keyword][key]["haplogroup"] = haplo_result
740
+ # add long text
741
+ long_text += text + ". \n"
742
+ # 4.3: UpgradeClassify
743
+ # try sample_id as accession number
744
+ sample_id = accession
745
+ if sample_id:
746
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
747
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
748
+ if locations!="No clear location found in top matches":
749
+ outputs[sample_id]["upgradeClassifier"] = {}
750
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
751
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
752
+ "sample_id": sample_id,
753
+ "predicted_location": ", ".join(locations),
754
+ "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
755
+ }
756
+ # try sample_id as isolate name
757
+ sample_id = isolate
758
+ if sample_id:
759
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
760
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
761
+ if locations!="No clear location found in top matches":
762
+ outputs[sample_id]["upgradeClassifier"] = {}
763
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
764
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
765
+ "sample_id": sample_id,
766
+ "predicted_location": ", ".join(locations),
767
+ "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
768
+ }
769
  return outputs, label, explain
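End to end, the classifier is driven by one call; a hedged sketch (the accession is illustrative, and the run needs network access plus the spaCy/NLTK downloads performed at import time):

```python
from mtdna_classifier import classify_sample_location

outputs, label, explain = classify_sample_location("KX123456")   # illustrative accession
print(label)      # "Ancient", "Modern", "Unknown", or "" when nothing could be fetched
for sample_id, per_source in outputs.items():
    for source, predictions in per_source.items():
        print(sample_id, source, list(predictions))   # e.g. ["QAModel", "haplogroup"]
```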
pipeline.py CHANGED
The diff for this file is too large to render. See raw diff
 
smart_fallback.py CHANGED
@@ -1,402 +1,402 @@
1
- from Bio import Entrez, Medline
2
- #import model
3
- import mtdna_classifier
4
- from NER.html import extractHTML
5
- import data_preprocess
6
- import pipeline
7
- import aiohttp
8
- import asyncio
9
- # Setup
10
- def fetch_ncbi(accession_number):
11
- try:
12
- Entrez.email = "your.email@example.com" # Required by NCBI, REPLACE WITH YOUR EMAIL
13
- handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
14
- record = Entrez.read(handle)
15
- handle.close()
16
- outputs = {"authors":"unknown",
17
- "institution":"unknown",
18
- "isolate":"unknown",
19
- "definition":"unknown",
20
- "title":"unknown",
21
- "seq_comment":"unknown",
22
- "collection_date":"unknown" } #'GBSeq_update-date': '25-OCT-2023', 'GBSeq_create-date'
23
- gb_seq = None
24
- # Validate record structure: It should be a list with at least one element (a dict)
25
- if isinstance(record, list) and len(record) > 0:
26
- if isinstance(record[0], dict):
27
- gb_seq = record[0]
28
- else:
29
- print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
30
- # extract collection date
31
- if "GBSeq_create-date" in gb_seq and outputs["collection_date"]=="unknown":
32
- outputs["collection_date"] = gb_seq["GBSeq_create-date"]
33
- else:
34
- if "GBSeq_update-date" in gb_seq and outputs["collection_date"]=="unknown":
35
- outputs["collection_date"] = gb_seq["GBSeq_update-date"]
36
- # extract definition
37
- if "GBSeq_definition" in gb_seq and outputs["definition"]=="unknown":
38
- outputs["definition"] = gb_seq["GBSeq_definition"]
39
- # extract related-reference things
40
- if "GBSeq_references" in gb_seq:
41
- for ref in gb_seq["GBSeq_references"]:
42
- # extract authors
43
- if "GBReference_authors" in ref and outputs["authors"]=="unknown":
44
- outputs["authors"] = "and ".join(ref["GBReference_authors"])
45
- # extract title
46
- if "GBReference_title" in ref and outputs["title"]=="unknown":
47
- outputs["title"] = ref["GBReference_title"]
48
- # extract submitted journal
49
- if 'GBReference_journal' in ref and outputs["institution"]=="unknown":
50
- outputs["institution"] = ref['GBReference_journal']
51
- # extract seq_comment
52
- if 'GBSeq_comment'in gb_seq and outputs["seq_comment"]=="unknown":
53
- outputs["seq_comment"] = gb_seq["GBSeq_comment"]
54
- # extract isolate
55
- if "GBSeq_feature-table" in gb_seq:
56
- if 'GBFeature_quals' in gb_seq["GBSeq_feature-table"][0]:
57
- for ref in gb_seq["GBSeq_feature-table"][0]["GBFeature_quals"]:
58
- if ref['GBQualifier_name'] == "isolate" and outputs["isolate"]=="unknown":
59
- outputs["isolate"] = ref["GBQualifier_value"]
60
- else:
61
- print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
62
-
63
- # If gb_seq is still None, return defaults
64
- if gb_seq is None:
65
- return {"authors":"unknown",
66
- "institution":"unknown",
67
- "isolate":"unknown",
68
- "definition":"unknown",
69
- "title":"unknown",
70
- "seq_comment":"unknown",
71
- "collection_date":"unknown" }
72
- return outputs
73
- except:
74
- print("error in fetching ncbi data")
75
- return {"authors":"unknown",
76
- "institution":"unknown",
77
- "isolate":"unknown",
78
- "definition":"unknown",
79
- "title":"unknown",
80
- "seq_comment":"unknown",
81
- "collection_date":"unknown" }
82
- # Fallback if NCBI crashed or cannot find accession on NBCI
83
- def google_accession_search(accession_id):
84
- """
85
- Search for metadata by accession ID using Google Custom Search.
86
- Falls back to known biological databases and archives.
87
- """
88
- queries = [
89
- f"{accession_id}",
90
- f"{accession_id} site:ncbi.nlm.nih.gov",
91
- f"{accession_id} site:pubmed.ncbi.nlm.nih.gov",
92
- f"{accession_id} site:europepmc.org",
93
- f"{accession_id} site:researchgate.net",
94
- f"{accession_id} mtDNA",
95
- f"{accession_id} mitochondrial DNA"
96
- ]
97
-
98
- links = []
99
- for query in queries:
100
- search_results = mtdna_classifier.search_google_custom(query, 2)
101
- for link in search_results:
102
- if link not in links:
103
- links.append(link)
104
- return links
105
-
106
- # Method 1: Smarter Google
107
- def smart_google_queries(metadata: dict):
108
- queries = []
109
-
110
- # Extract useful fields
111
- isolate = metadata.get("isolate")
112
- author = metadata.get("authors")
113
- institution = metadata.get("institution")
114
- title = metadata.get("title")
115
- combined = []
116
- # Construct queries
117
- if isolate and isolate!="unknown" and isolate!="Unpublished":
118
- queries.append(f'"{isolate}" mitochondrial DNA')
119
- queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
120
-
121
- if author and author!="unknown" and author!="Unpublished":
122
- # try:
123
- # author_name = ".".join(author.split(' ')[0].split(".")[:-1]) # Use last name only
124
- # except:
125
- # try:
126
- # author_name = author.split(',')[0] # Use last name only
127
- # except:
128
- # author_name = author
129
- try:
130
- author_name = author.split(',')[0] # Use last name only
131
- except:
132
- author_name = author
133
- queries.append(f'"{author_name}" mitochondrial DNA')
134
- queries.append(f'"{author_name}" mtDNA site:researchgate.net')
135
-
136
- if institution and institution!="unknown" and institution!="Unpublished":
137
- try:
138
- short_inst = ",".join(institution.split(',')[:2]) # Take first part of institution
139
- except:
140
- try:
141
- short_inst = institution.split(',')[0]
142
- except:
143
- short_inst = institution
144
- queries.append(f'"{short_inst}" mtDNA sequence')
145
- #queries.append(f'"{short_inst}" isolate site:nature.com')
146
- if title and title!='unknown' and title!="Unpublished":
147
- if title!="Direct Submission":
148
- queries.append(title)
149
-
150
- return queries
151
-
152
- # def filter_links_by_metadata(search_results, saveLinkFolder, accession=None, stop_flag=None):
153
- # TRUSTED_DOMAINS = [
154
- # "ncbi.nlm.nih.gov",
155
- # "pubmed.ncbi.nlm.nih.gov",
156
- # "pmc.ncbi.nlm.nih.gov",
157
- # "biorxiv.org",
158
- # "researchgate.net",
159
- # "nature.com",
160
- # "sciencedirect.com"
161
- # ]
162
- # if stop_flag is not None and stop_flag.value:
163
- # print(f"🛑 Stop detected {accession}, aborting early...")
164
- # return []
165
- # def is_trusted_link(link):
166
- # for domain in TRUSTED_DOMAINS:
167
- # if domain in link:
168
- # return True
169
- # return False
170
- # def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
171
- # output = []
172
- # keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
173
- # if accession:
174
- # keywords = [accession] + keywords
175
- # title_snippet = link.lower()
176
- # print("save link folder inside this filter function: ", saveLinkFolder)
177
- # success_process, output_process = pipeline.run_with_timeout(data_preprocess.extract_text,args=(link,saveLinkFolder),timeout=60)
178
- # if stop_flag is not None and stop_flag.value:
179
- # print(f"🛑 Stop detected {accession}, aborting early...")
180
- # return []
181
- # if success_process:
182
- # article_text = output_process
183
- # print("yes succeed for getting article text")
184
- # else:
185
- # print("no suceed, fallback to no link")
186
- # article_text = ""
187
- # #article_text = data_preprocess.extract_text(link,saveLinkFolder)
188
- # print("article text")
189
- # #print(article_text)
190
- # if stop_flag is not None and stop_flag.value:
191
- # print(f"🛑 Stop detected {accession}, aborting early...")
192
- # return []
193
- # try:
194
- # ext = link.split(".")[-1].lower()
195
- # if ext not in ["pdf", "docx", "xlsx"]:
196
- # html = extractHTML.HTML("", link)
197
- # if stop_flag is not None and stop_flag.value:
198
- # print(f"🛑 Stop detected {accession}, aborting early...")
199
- # return []
200
- # jsonSM = html.getSupMaterial()
201
- # if jsonSM:
202
- # output += sum((jsonSM[key] for key in jsonSM), [])
203
- # except Exception:
204
- # pass # continue silently
205
- # for keyword in keywords:
206
- # if keyword.lower() in article_text.lower():
207
- # if link not in output:
208
- # output.append([link,keyword.lower()])
209
- # print("link and keyword for article text: ", link, keyword)
210
- # return output
211
- # if keyword.lower() in title_snippet.lower():
212
- # if link not in output:
213
- # output.append([link,keyword.lower()])
214
- # print("link and keyword for title: ", link, keyword)
215
- # return output
216
- # return output
217
-
218
- # filtered = []
219
- # better_filter = []
220
- # if len(search_results) > 0:
221
- # for link in search_results:
222
- # # if is_trusted_link(link):
223
- # # if link not in filtered:
224
- # # filtered.append(link)
225
- # # else:
226
- # print(link)
227
- # if stop_flag is not None and stop_flag.value:
228
- # print(f"🛑 Stop detected {accession}, aborting early...")
229
- # return []
230
- # if link:
231
- # output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
232
- # print("output link: ")
233
- # print(output_link)
234
- # for out_link in output_link:
235
- # if isinstance(out_link,list) and len(out_link) > 1:
236
- # print(out_link)
237
- # kw = out_link[1]
238
- # print("kw and acc: ", kw, accession.lower())
239
- # if accession and kw == accession.lower():
240
- # better_filter.append(out_link[0])
241
- # filtered.append(out_link[0])
242
- # else: filtered.append(out_link)
243
- # print("done with link and here is filter: ",filtered)
244
- # if better_filter:
245
- # filtered = better_filter
246
- # return filtered
247
- async def process_link(session, link, saveLinkFolder, keywords, accession):
248
- output = []
249
- title_snippet = link.lower()
250
-
251
- # use async extractor for web, fallback to sync for local files
252
- if link.startswith("http"):
253
- article_text = await data_preprocess.async_extract_text(link, saveLinkFolder)
254
- else:
255
- article_text = data_preprocess.extract_text(link, saveLinkFolder)
256
-
257
- for keyword in keywords:
258
- if article_text and keyword.lower() in article_text.lower():
259
- output.append([link, keyword.lower(), article_text])
260
- return output
261
- if keyword.lower() in title_snippet:
262
- output.append([link, keyword.lower()])
263
- return output
264
- return output
265
-
266
- async def async_filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
267
- TRUSTED_DOMAINS = [
268
- "ncbi.nlm.nih.gov", "pubmed.ncbi.nlm.nih.gov", "pmc.ncbi.nlm.nih.gov",
269
- "biorxiv.org", "researchgate.net", "nature.com", "sciencedirect.com"
270
- ]
271
-
272
- keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
273
- if accession:
274
- keywords = [accession] + keywords
275
-
276
- filtered, better_filter = {}, {}
277
- print("before doing session")
278
- async with aiohttp.ClientSession() as session:
279
- tasks = []
280
- for link in search_results:
281
- if link:
282
- print("link: ", link)
283
- tasks.append(process_link(session, link, saveLinkFolder, keywords, accession))
284
- print("done")
285
- results = await asyncio.gather(*tasks)
286
- print("outside session")
287
- # merge results
288
- for output_link in results:
289
- for out_link in output_link:
290
- if isinstance(out_link, list) and len(out_link) > 1:
291
- kw = out_link[1]
292
- if accession and kw == accession.lower():
293
- if len(out_link) == 2:
294
- better_filter[out_link[0]] = ""
295
- elif len(out_link) == 3:
296
- better_filter[out_link[0]] = out_link[2]
297
- if len(out_link) == 2:
298
- better_filter[out_link[0]] = ""
299
- elif len(out_link) == 3:
300
- better_filter[out_link[0]] = out_link[2]
301
- else:
302
- filtered[out_link] = ""
303
-
304
- return better_filter or filtered
305
-
306
- def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
307
- TRUSTED_DOMAINS = [
308
- "ncbi.nlm.nih.gov",
309
- "pubmed.ncbi.nlm.nih.gov",
310
- "pmc.ncbi.nlm.nih.gov",
311
- "biorxiv.org",
312
- "researchgate.net",
313
- "nature.com",
314
- "sciencedirect.com"
315
- ]
316
- def is_trusted_link(link):
317
- for domain in TRUSTED_DOMAINS:
318
- if domain in link:
319
- return True
320
- return False
321
- def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
322
- output = []
323
- keywords = ["mtDNA", "mitochondrial", "Homo sapiens"]
324
- #keywords = ["mtDNA", "mitochondrial"]
325
- if accession:
326
- keywords = [accession] + keywords
327
- title_snippet = link.lower()
328
- #print("save link folder inside this filter function: ", saveLinkFolder)
329
- article_text = data_preprocess.extract_text(link,saveLinkFolder)
330
- print("article text done")
331
- #print(article_text)
332
- try:
333
- ext = link.split(".")[-1].lower()
334
- if ext not in ["pdf", "docx", "xlsx"]:
335
- html = extractHTML.HTML("", link)
336
- jsonSM = html.getSupMaterial()
337
- if jsonSM:
338
- output += sum((jsonSM[key] for key in jsonSM), [])
339
- except Exception:
340
- pass # continue silently
341
- for keyword in keywords:
342
- if article_text:
343
- if keyword.lower() in article_text.lower():
344
- if link not in output:
345
- output.append([link,keyword.lower(), article_text])
346
- return output
347
- if keyword.lower() in title_snippet.lower():
348
- if link not in output:
349
- output.append([link,keyword.lower()])
350
- print("link and keyword for title: ", link, keyword)
351
- return output
352
- return output
353
-
354
- filtered = {}
355
- better_filter = {}
356
- if len(search_results) > 0:
357
- print(search_results)
358
- for link in search_results:
359
- # if is_trusted_link(link):
360
- # if link not in filtered:
361
- # filtered.append(link)
362
- # else:
363
- print(link)
364
- if link:
365
- output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
366
- print("output link: ")
367
- print(output_link)
368
- for out_link in output_link:
369
- if isinstance(out_link,list) and len(out_link) > 1:
370
- print(out_link)
371
- kw = out_link[1]
372
- if accession and kw == accession.lower():
373
- if len(out_link) == 2:
374
- better_filter[out_link[0]] = ""
375
- elif len(out_link) == 3:
376
- # save article
377
- better_filter[out_link[0]] = out_link[2]
378
- if len(out_link) == 2:
379
- better_filter[out_link[0]] = ""
380
- elif len(out_link) == 3:
381
- # save article
382
- better_filter[out_link[0]] = out_link[2]
383
- else: filtered[out_link] = ""
384
- print("done with link and here is filter: ",filtered)
385
- if better_filter:
386
- filtered = better_filter
387
- return filtered
388
-
389
- def smart_google_search(metadata):
390
- queries = smart_google_queries(metadata)
391
- links = []
392
- for q in queries:
393
- #print("\n🔍 Query:", q)
394
- results = mtdna_classifier.search_google_custom(q,2)
395
- for link in results:
396
- #print(f"- {link}")
397
- if link not in links:
398
- links.append(link)
399
- #filter_links = filter_links_by_metadata(links)
400
- return links
401
- # Method 2: Prompt LLM better or better ai search api with all
402
  # the total information from even ncbi and all search
 
1
+ from Bio import Entrez, Medline
2
+ #import model
3
+ import mtdna_classifier
4
+ from NER.html import extractHTML
5
+ import data_preprocess
6
+ import pipeline
7
+ import aiohttp
8
+ import asyncio
9
+ # Setup
10
+ def fetch_ncbi(accession_number):
11
+ try:
12
+ Entrez.email = "your.email@example.com" # Required by NCBI, REPLACE WITH YOUR EMAIL
13
+ handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
14
+ record = Entrez.read(handle)
15
+ handle.close()
16
+ outputs = {"authors":"unknown",
17
+ "institution":"unknown",
18
+ "isolate":"unknown",
19
+ "definition":"unknown",
20
+ "title":"unknown",
21
+ "seq_comment":"unknown",
22
+ "collection_date":"unknown" } #'GBSeq_update-date': '25-OCT-2023', 'GBSeq_create-date'
23
+ gb_seq = None
24
+ # Validate record structure: It should be a list with at least one element (a dict)
25
+ if isinstance(record, list) and len(record) > 0:
26
+ if isinstance(record[0], dict):
27
+ gb_seq = record[0]
28
+ else:
29
+ print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
30
+ # extract collection date
31
+ if "GBSeq_create-date" in gb_seq and outputs["collection_date"]=="unknown":
32
+ outputs["collection_date"] = gb_seq["GBSeq_create-date"]
33
+ else:
34
+ if "GBSeq_update-date" in gb_seq and outputs["collection_date"]=="unknown":
35
+ outputs["collection_date"] = gb_seq["GBSeq_update-date"]
36
+ # extract definition
37
+ if "GBSeq_definition" in gb_seq and outputs["definition"]=="unknown":
38
+ outputs["definition"] = gb_seq["GBSeq_definition"]
39
+ # extract related-reference things
40
+ if "GBSeq_references" in gb_seq:
41
+ for ref in gb_seq["GBSeq_references"]:
42
+ # extract authors
43
+ if "GBReference_authors" in ref and outputs["authors"]=="unknown":
44
+ outputs["authors"] = "and ".join(ref["GBReference_authors"])
45
+ # extract title
46
+ if "GBReference_title" in ref and outputs["title"]=="unknown":
47
+ outputs["title"] = ref["GBReference_title"]
48
+ # extract submitted journal
49
+ if 'GBReference_journal' in ref and outputs["institution"]=="unknown":
50
+ outputs["institution"] = ref['GBReference_journal']
51
+ # extract seq_comment
52
+ if 'GBSeq_comment' in gb_seq and outputs["seq_comment"]=="unknown":
53
+ outputs["seq_comment"] = gb_seq["GBSeq_comment"]
54
+ # extract isolate
55
+ if "GBSeq_feature-table" in gb_seq:
56
+ if 'GBFeature_quals' in gb_seq["GBSeq_feature-table"][0]:
57
+ for ref in gb_seq["GBSeq_feature-table"][0]["GBFeature_quals"]:
58
+ if ref['GBQualifier_name'] == "isolate" and outputs["isolate"]=="unknown":
59
+ outputs["isolate"] = ref["GBQualifier_value"]
60
+ else:
61
+ print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
62
+
63
+ # If gb_seq is still None, return defaults
64
+ if gb_seq is None:
65
+ return {"authors":"unknown",
66
+ "institution":"unknown",
67
+ "isolate":"unknown",
68
+ "definition":"unknown",
69
+ "title":"unknown",
70
+ "seq_comment":"unknown",
71
+ "collection_date":"unknown" }
72
+ return outputs
73
+ except Exception as e:
74
+ print(f"error in fetching ncbi data: {e}")
75
+ return {"authors":"unknown",
76
+ "institution":"unknown",
77
+ "isolate":"unknown",
78
+ "definition":"unknown",
79
+ "title":"unknown",
80
+ "seq_comment":"unknown",
81
+ "collection_date":"unknown" }
82
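A minimal usage sketch for fetch_ncbi above, hedged: the module name smart_fallback comes from this repository's file name, and the accession string is a placeholder to be replaced with a real GenBank ID.
# Hypothetical usage (not part of the commit): pull GenBank metadata for one
# accession and inspect the fields that later drive the Google queries.
import smart_fallback

meta = smart_fallback.fetch_ncbi("AB123456")  # placeholder accession ID
for field in ("authors", "institution", "isolate", "definition",
              "title", "seq_comment", "collection_date"):
    # every field defaults to "unknown" when GenBank has no value for it
    print(field, "->", meta.get(field, "unknown"))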
+ # Fallback if NCBI crashed or the accession cannot be found on NCBI
83
+ def google_accession_search(accession_id):
84
+ """
85
+ Search for metadata by accession ID using Google Custom Search.
86
+ Falls back to known biological databases and archives.
87
+ """
88
+ queries = [
89
+ f"{accession_id}",
90
+ f"{accession_id} site:ncbi.nlm.nih.gov",
91
+ f"{accession_id} site:pubmed.ncbi.nlm.nih.gov",
92
+ f"{accession_id} site:europepmc.org",
93
+ f"{accession_id} site:researchgate.net",
94
+ f"{accession_id} mtDNA",
95
+ f"{accession_id} mitochondrial DNA"
96
+ ]
97
+
98
+ links = []
99
+ for query in queries:
100
+ search_results = mtdna_classifier.search_google_custom(query, 2)
101
+ for link in search_results:
102
+ if link not in links:
103
+ links.append(link)
104
+ return links
105
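A sketch of how the two lookups could be chained; the fall-back rule (use the Google accession search only when every GenBank field is "unknown") is an illustration, not something the commit itself enforces.
# Hypothetical fallback chain: Entrez first, plain accession search second.
import smart_fallback

def lookup(accession_id):
    meta = smart_fallback.fetch_ncbi(accession_id)
    if all(value == "unknown" for value in meta.values()):
        # NCBI returned nothing usable; fall back to Google Custom Search links
        return meta, smart_fallback.google_accession_search(accession_id)
    return meta, []

metadata, fallback_links = lookup("AB123456")  # placeholder accession ID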
+
106
+ # Method 1: Smarter Google
107
+ def smart_google_queries(metadata: dict):
108
+ queries = []
109
+
110
+ # Extract useful fields
111
+ isolate = metadata.get("isolate")
112
+ author = metadata.get("authors")
113
+ institution = metadata.get("institution")
114
+ title = metadata.get("title")
115
+ combined = []
116
+ # Construct queries
117
+ if isolate and isolate!="unknown" and isolate!="Unpublished":
118
+ queries.append(f'"{isolate}" mitochondrial DNA')
119
+ queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')
120
+
121
+ if author and author!="unknown" and author!="Unpublished":
122
+ # try:
123
+ # author_name = ".".join(author.split(' ')[0].split(".")[:-1]) # Use last name only
124
+ # except:
125
+ # try:
126
+ # author_name = author.split(',')[0] # Use last name only
127
+ # except:
128
+ # author_name = author
129
+ try:
130
+ author_name = author.split(',')[0] # Use last name only
131
+ except:
132
+ author_name = author
133
+ queries.append(f'"{author_name}" mitochondrial DNA')
134
+ queries.append(f'"{author_name}" mtDNA site:researchgate.net')
135
+
136
+ if institution and institution!="unknown" and institution!="Unpublished":
137
+ try:
138
+ short_inst = ",".join(institution.split(',')[:2]) # Take first part of institution
139
+ except:
140
+ try:
141
+ short_inst = institution.split(',')[0]
142
+ except:
143
+ short_inst = institution
144
+ queries.append(f'"{short_inst}" mtDNA sequence')
145
+ #queries.append(f'"{short_inst}" isolate site:nature.com')
146
+ if title and title!='unknown' and title!="Unpublished":
147
+ if title!="Direct Submission":
148
+ queries.append(title)
149
+
150
+ return queries
151
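To make the query-construction rules above concrete, a hedged example with an invented metadata dict; it assumes smart_google_queries is called from within this module.
# Example input/output for smart_google_queries (all values invented).
example_metadata = {
    "isolate": "XYZ123",
    "authors": "Doe,J. and Smith,A.",
    "institution": "Submitted (01-JAN-2020) Dept. of Biology, Example University, Example City",
    "title": "Mitochondrial genomes of example populations",
}
for q in smart_google_queries(example_metadata):
    print(q)
# Expected queries, one per line, e.g.:
#   "XYZ123" mitochondrial DNA
#   "XYZ123" site:ncbi.nlm.nih.gov
#   "Doe" mitochondrial DNA
#   "Doe" mtDNA site:researchgate.net
#   ...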
+
152
+ # def filter_links_by_metadata(search_results, saveLinkFolder, accession=None, stop_flag=None):
153
+ # TRUSTED_DOMAINS = [
154
+ # "ncbi.nlm.nih.gov",
155
+ # "pubmed.ncbi.nlm.nih.gov",
156
+ # "pmc.ncbi.nlm.nih.gov",
157
+ # "biorxiv.org",
158
+ # "researchgate.net",
159
+ # "nature.com",
160
+ # "sciencedirect.com"
161
+ # ]
162
+ # if stop_flag is not None and stop_flag.value:
163
+ # print(f"🛑 Stop detected {accession}, aborting early...")
164
+ # return []
165
+ # def is_trusted_link(link):
166
+ # for domain in TRUSTED_DOMAINS:
167
+ # if domain in link:
168
+ # return True
169
+ # return False
170
+ # def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
171
+ # output = []
172
+ # keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
173
+ # if accession:
174
+ # keywords = [accession] + keywords
175
+ # title_snippet = link.lower()
176
+ # print("save link folder inside this filter function: ", saveLinkFolder)
177
+ # success_process, output_process = pipeline.run_with_timeout(data_preprocess.extract_text,args=(link,saveLinkFolder),timeout=60)
178
+ # if stop_flag is not None and stop_flag.value:
179
+ # print(f"🛑 Stop detected {accession}, aborting early...")
180
+ # return []
181
+ # if success_process:
182
+ # article_text = output_process
183
+ # print("yes succeed for getting article text")
184
+ # else:
185
+ # print("no suceed, fallback to no link")
186
+ # article_text = ""
187
+ # #article_text = data_preprocess.extract_text(link,saveLinkFolder)
188
+ # print("article text")
189
+ # #print(article_text)
190
+ # if stop_flag is not None and stop_flag.value:
191
+ # print(f"🛑 Stop detected {accession}, aborting early...")
192
+ # return []
193
+ # try:
194
+ # ext = link.split(".")[-1].lower()
195
+ # if ext not in ["pdf", "docx", "xlsx"]:
196
+ # html = extractHTML.HTML("", link)
197
+ # if stop_flag is not None and stop_flag.value:
198
+ # print(f"🛑 Stop detected {accession}, aborting early...")
199
+ # return []
200
+ # jsonSM = html.getSupMaterial()
201
+ # if jsonSM:
202
+ # output += sum((jsonSM[key] for key in jsonSM), [])
203
+ # except Exception:
204
+ # pass # continue silently
205
+ # for keyword in keywords:
206
+ # if keyword.lower() in article_text.lower():
207
+ # if link not in output:
208
+ # output.append([link,keyword.lower()])
209
+ # print("link and keyword for article text: ", link, keyword)
210
+ # return output
211
+ # if keyword.lower() in title_snippet.lower():
212
+ # if link not in output:
213
+ # output.append([link,keyword.lower()])
214
+ # print("link and keyword for title: ", link, keyword)
215
+ # return output
216
+ # return output
217
+
218
+ # filtered = []
219
+ # better_filter = []
220
+ # if len(search_results) > 0:
221
+ # for link in search_results:
222
+ # # if is_trusted_link(link):
223
+ # # if link not in filtered:
224
+ # # filtered.append(link)
225
+ # # else:
226
+ # print(link)
227
+ # if stop_flag is not None and stop_flag.value:
228
+ # print(f"🛑 Stop detected {accession}, aborting early...")
229
+ # return []
230
+ # if link:
231
+ # output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
232
+ # print("output link: ")
233
+ # print(output_link)
234
+ # for out_link in output_link:
235
+ # if isinstance(out_link,list) and len(out_link) > 1:
236
+ # print(out_link)
237
+ # kw = out_link[1]
238
+ # print("kw and acc: ", kw, accession.lower())
239
+ # if accession and kw == accession.lower():
240
+ # better_filter.append(out_link[0])
241
+ # filtered.append(out_link[0])
242
+ # else: filtered.append(out_link)
243
+ # print("done with link and here is filter: ",filtered)
244
+ # if better_filter:
245
+ # filtered = better_filter
246
+ # return filtered
247
+ async def process_link(session, link, saveLinkFolder, keywords, accession):
248
+ output = []
249
+ title_snippet = link.lower()
250
+
251
+ # use async extractor for web, fallback to sync for local files
252
+ if link.startswith("http"):
253
+ article_text = await data_preprocess.async_extract_text(link, saveLinkFolder)
254
+ else:
255
+ article_text = data_preprocess.extract_text(link, saveLinkFolder)
256
+
257
+ for keyword in keywords:
258
+ if article_text and keyword.lower() in article_text.lower():
259
+ output.append([link, keyword.lower(), article_text])
260
+ return output
261
+ if keyword.lower() in title_snippet:
262
+ output.append([link, keyword.lower()])
263
+ return output
264
+ return output
265
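A small driver for the process_link coroutine, hedged: the folder, keywords, and URL are placeholders, and the aiohttp session is passed through even though the coroutine currently delegates fetching to data_preprocess.
# Hypothetical single-link check built on the coroutine above.
import asyncio
import aiohttp

async def check_one(link, save_folder="downloads", accession="AB123456"):
    keywords = [accession, "mtDNA", "mitochondrial"]
    async with aiohttp.ClientSession() as session:
        return await process_link(session, link, save_folder, keywords, accession)

# hits = asyncio.run(check_one("https://example.org/article"))  # placeholder URL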
+
266
+ async def async_filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
267
+ TRUSTED_DOMAINS = [
268
+ "ncbi.nlm.nih.gov", "pubmed.ncbi.nlm.nih.gov", "pmc.ncbi.nlm.nih.gov",
269
+ "biorxiv.org", "researchgate.net", "nature.com", "sciencedirect.com"
270
+ ]
271
+
272
+ keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
273
+ if accession:
274
+ keywords = [accession] + keywords
275
+
276
+ filtered, better_filter = {}, {}
277
+ print("before doing session")
278
+ async with aiohttp.ClientSession() as session:
279
+ tasks = []
280
+ for link in search_results:
281
+ if link:
282
+ print("link: ", link)
283
+ tasks.append(process_link(session, link, saveLinkFolder, keywords, accession))
284
+ print("done")
285
+ results = await asyncio.gather(*tasks)
286
+ print("outside session")
287
+ # merge results
288
+ for output_link in results:
289
+ for out_link in output_link:
290
+ if isinstance(out_link, list) and len(out_link) > 1:
291
+ kw = out_link[1]
292
+ if accession and kw == accession.lower():
293
+ if len(out_link) == 2:
294
+ better_filter[out_link[0]] = ""
295
+ elif len(out_link) == 3:
296
+ better_filter[out_link[0]] = out_link[2]
297
+ if len(out_link) == 2:
298
+ filtered[out_link[0]] = ""
299
+ elif len(out_link) == 3:
300
+ filtered[out_link[0]] = out_link[2]
301
+ else:
302
+ filtered[out_link] = ""
303
+
304
+ return better_filter or filtered
305
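A sketch of how the async filter might be driven from synchronous code; the links and folder are placeholders.
# Hypothetical driver for async_filter_links_by_metadata. The result maps each
# kept link to its extracted article text (or "" when only the URL matched);
# links whose text contains the accession itself are preferred when one is given.
import asyncio

candidate_links = [
    "https://example.org/paper-1",   # placeholder URLs
    "https://example.org/paper-2",
]
kept = asyncio.run(
    async_filter_links_by_metadata(candidate_links, "downloads", accession="AB123456")
)
print(list(kept.keys()))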
+
306
+ def filter_links_by_metadata(search_results, saveLinkFolder, accession=None):
307
+ TRUSTED_DOMAINS = [
308
+ "ncbi.nlm.nih.gov",
309
+ "pubmed.ncbi.nlm.nih.gov",
310
+ "pmc.ncbi.nlm.nih.gov",
311
+ "biorxiv.org",
312
+ "researchgate.net",
313
+ "nature.com",
314
+ "sciencedirect.com"
315
+ ]
316
+ def is_trusted_link(link):
317
+ for domain in TRUSTED_DOMAINS:
318
+ if domain in link:
319
+ return True
320
+ return False
321
+ def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
322
+ output = []
323
+ keywords = ["mtDNA", "mitochondrial", "Homo sapiens"]
324
+ #keywords = ["mtDNA", "mitochondrial"]
325
+ if accession:
326
+ keywords = [accession] + keywords
327
+ title_snippet = link.lower()
328
+ #print("save link folder inside this filter function: ", saveLinkFolder)
329
+ article_text = data_preprocess.extract_text(link,saveLinkFolder)
330
+ print("article text done")
331
+ #print(article_text)
332
+ try:
333
+ ext = link.split(".")[-1].lower()
334
+ if ext not in ["pdf", "docx", "xlsx"]:
335
+ html = extractHTML.HTML("", link)
336
+ jsonSM = html.getSupMaterial()
337
+ if jsonSM:
338
+ output += sum((jsonSM[key] for key in jsonSM), [])
339
+ except Exception:
340
+ pass # continue silently
341
+ for keyword in keywords:
342
+ if article_text:
343
+ if keyword.lower() in article_text.lower():
344
+ if link not in output:
345
+ output.append([link,keyword.lower(), article_text])
346
+ return output
347
+ if keyword.lower() in title_snippet.lower():
348
+ if link not in output:
349
+ output.append([link,keyword.lower()])
350
+ print("link and keyword for title: ", link, keyword)
351
+ return output
352
+ return output
353
+
354
+ filtered = {}
355
+ better_filter = {}
356
+ if len(search_results) > 0:
357
+ print(search_results)
358
+ for link in search_results:
359
+ # if is_trusted_link(link):
360
+ # if link not in filtered:
361
+ # filtered.append(link)
362
+ # else:
363
+ print(link)
364
+ if link:
365
+ output_link = is_relevant_title_snippet(link,saveLinkFolder, accession)
366
+ print("output link: ")
367
+ print(output_link)
368
+ for out_link in output_link:
369
+ if isinstance(out_link,list) and len(out_link) > 1:
370
+ print(out_link)
371
+ kw = out_link[1]
372
+ if accession and kw == accession.lower():
373
+ if len(out_link) == 2:
374
+ better_filter[out_link[0]] = ""
375
+ elif len(out_link) == 3:
376
+ # save article
377
+ better_filter[out_link[0]] = out_link[2]
378
+ if len(out_link) == 2:
379
+ filtered[out_link[0]] = ""
380
+ elif len(out_link) == 3:
381
+ # save article
382
+ filtered[out_link[0]] = out_link[2]
383
+ else: filtered[out_link] = ""
384
+ print("done with link and here is filter: ",filtered)
385
+ if better_filter:
386
+ filtered = better_filter
387
+ return filtered
388
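The synchronous variant above returns the same {link: article_text} mapping; a minimal sketch with placeholder arguments.
# Hypothetical synchronous call, mirroring the async driver above.
links = ["https://example.org/paper-1"]  # placeholder URL
kept = filter_links_by_metadata(links, "downloads", accession="AB123456")
for link, article_text in kept.items():
    print(link, len(article_text))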
+
389
+ def smart_google_search(metadata):
390
+ queries = smart_google_queries(metadata)
391
+ links = []
392
+ for q in queries:
393
+ #print("\n🔍 Query:", q)
394
+ results = mtdna_classifier.search_google_custom(q,2)
395
+ for link in results:
396
+ #print(f"- {link}")
397
+ if link not in links:
398
+ links.append(link)
399
+ #filter_links = filter_links_by_metadata(links)
400
+ return links
401
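Putting the pieces together, a hedged end-to-end sketch of the fallback path this module appears to implement; the accession and folder name are placeholders.
# Hypothetical end-to-end flow: GenBank metadata -> smart queries -> filtered links.
import smart_fallback

accession = "AB123456"                      # placeholder accession ID
meta = smart_fallback.fetch_ncbi(accession)
links = smart_fallback.smart_google_search(meta)
if not links:
    # metadata-based queries found nothing; fall back to the raw accession search
    links = smart_fallback.google_accession_search(accession)
kept = smart_fallback.filter_links_by_metadata(links, "downloads", accession=accession)
print(f"{len(kept)} candidate sources kept for {accession}")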
+ # Method 2: Prompt LLM better or better ai search api with all
402
  # the total information from even ncbi and all search