Spaces:

VyLala
/

mtDNALocation

Running

File size: 48,328 Bytes

# test1: MJ17 direct
# test2: "A1YU101" thailand cross-ref
# test3: "EBK109" thailand cross-ref
# test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
import data_preprocess
import model
import mtdna_classifier
#import app
import smart_fallback
import pandas as pd
from pathlib import Path
import subprocess
from NER.html import extractHTML
import os
import google.generativeai as genai
import re
import standardize_location
# Helper functions in for this pipeline
# Track time
import time
import multiprocessing
import gspread
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
from google.oauth2.service_account import Credentials
from oauth2client.service_account import ServiceAccountCredentials
import io
import json
#––– Authentication setup –––
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"])  # from HF secrets
GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)

def get_or_create_drive_folder(name, parent_id=None):
    query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
    if parent_id:
        query += f" and '{parent_id}' in parents"
    results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
    items = results.get("files", [])
    if items:
        return items[0]["id"]
    file_metadata = {
        "name": name,
        "mimeType": "application/vnd.google-apps.folder"
    }
    if parent_id:
        file_metadata["parents"] = [parent_id]
    file = drive_service.files().create(body=file_metadata, fields="id").execute()
    return file["id"]
# def find_drive_file(filename, parent_id):
#     """
#     Checks if a file with the given name exists inside the specified Google Drive folder.
#     Returns the file ID if found, else None.
#     """
#     query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false"
#     results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute()
#     files = results.get('files', [])
#     if files:
#         return files[0]["id"]
#     return None

def find_drive_file(filename, parent_id):
    """
    Checks if a file with the given name exists inside the specified Google Drive folder.
    Returns the file ID if found, else None.
    """
    try:
        print(f"🔍 Searching for '{filename}' in folder: {parent_id}")
        query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false"
        results = drive_service.files().list(
            q=query,
            spaces='drive',
            fields='files(id, name)',
            pageSize=1
        ).execute()
        files = results.get('files', [])
        if files:
            print(f"✅ Found file: {files[0]['name']} with ID: {files[0]['id']}")
            return files[0]["id"]
        else:
            print("⚠️ File not found.")
            return None
    except Exception as e:
        print(f"❌ Error during find_drive_file: {e}")
        return None



# def upload_file_to_drive(local_path, remote_name, folder_id):
#     file_metadata = {"name": remote_name, "parents": [folder_id]}
#     media = MediaFileUpload(local_path, resumable=True)
#     existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
#     if existing:
#         drive_service.files().delete(fileId=existing[0]["id"]).execute()
#     file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
#     result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
#     if not result.get("files"):
#         print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.")
#     else:
#         print(f"✅ Verified upload: {remote_name}")
#     return file["id"]
def upload_file_to_drive(local_path, remote_name, folder_id):
    try:
        if not os.path.exists(local_path):
            raise FileNotFoundError(f"❌ Local file does not exist: {local_path}")

        # Delete existing file on Drive if present
        existing = drive_service.files().list(
            q=f"name='{remote_name}' and '{folder_id}' in parents and trashed = false",
            fields="files(id)"
        ).execute().get("files", [])

        if existing:
            drive_service.files().delete(fileId=existing[0]["id"]).execute()
            print(f"🗑️ Deleted existing '{remote_name}' in Drive folder {folder_id}")

        file_metadata = {"name": remote_name, "parents": [folder_id]}
        media = MediaFileUpload(local_path, resumable=True)
        file = drive_service.files().create(
            body=file_metadata,
            media_body=media,
            fields="id"
        ).execute()

        print(f"✅ Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}")
        return file["id"]

    except Exception as e:
        print(f"❌ Error during upload: {e}")
        return None


def download_file_from_drive(remote_name, folder_id, local_path):
    results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
    files = results.get("files", [])
    if not files:
        return False
    file_id = files[0]["id"]
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.FileIO(local_path, 'wb')
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    return True
def download_drive_file_content(file_id):
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    fh.seek(0)
    return fh.read().decode("utf-8")

# def run_with_timeout(func, args=(), kwargs={}, timeout=20):
#     """
#     Runs `func` with timeout in seconds. Kills if it exceeds.
#     Returns: (success, result or None)
#     """
#     def wrapper(q, *args, **kwargs):
#         try:
#             q.put(func(*args, **kwargs))
#         except Exception as e:
#             q.put(e)

#     q = multiprocessing.Queue()
#     p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
#     p.start()
#     p.join(timeout)

#     if p.is_alive():
#         p.terminate()
#         p.join()
#         print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
#         return False, None
#     else:
#         result = q.get()
#         if isinstance(result, Exception):
#             raise result
#         return True, result
# def run_with_timeout(func, args=(), kwargs={}, timeout=30):
#     import concurrent.futures
#     with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
#         future = executor.submit(func, *args, **kwargs)
#         try:
#             return True, future.result(timeout=timeout)
#         except concurrent.futures.TimeoutError:
#             print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
#             return False, None

import multiprocessing

def run_with_timeout(func, args=(), kwargs={}, timeout=30):
    def wrapper(q, *args, **kwargs):
        try:
            result = func(*args, **kwargs)
            q.put((True, result))
        except Exception as e:
            q.put((False, e))

    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
    p.start()
    p.join(timeout)

    if p.is_alive():
        p.terminate()
        p.join()
        print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
        return False, None

    if not q.empty():
        success, result = q.get()
        if success:
            return True, result
        else:
            raise result  # re-raise exception if needed

    return False, None



def time_it(func, *args, **kwargs):
    """
    Measure how long a function takes to run and return its result + time.
    """
    start = time.time()
    result = func(*args, **kwargs)
    end = time.time()
    elapsed = end - start
    print(f"⏱️ '{func.__name__}' took {elapsed:.3f} seconds")
    return result, elapsed
# --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---    

def unique_preserve_order(seq):
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]
# Main execution
def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None):
  # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
  # there can be one accession number in the accessions
  # Prices are per 1,000 tokens
  # Before each big step:
  if stop_flag is not None and stop_flag.value:
    print(f"🛑 Stop detected before starting {accession}, aborting early...")
    return {}
  # PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
  # PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
  # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
  # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
  PRICE_PER_1K_INPUT_LLM = 0.00010      # $0.10 per 1M input tokens
  PRICE_PER_1K_OUTPUT_LLM = 0.00040     # $0.40 per 1M output tokens

  # Embedding-001 pricing per 1,000 input tokens
  PRICE_PER_1K_EMBEDDING_INPUT = 0.00015  # $0.15 per 1M input tokens  
  if not accessions:
    print("no input")
    return None
  else:  
    accs_output = {}
    #genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))  
    for acc in accessions:
      print("start gemini: ", acc)  
      start = time.time()
      total_cost_title = 0
      jsonSM, links, article_text = {},[], ""
      acc_score = { "isolate": "",
                    "country":{},
                   "sample_type":{},
                   #"specific_location":{},
                   #"ethnicity":{},
                   "query_cost":total_cost_title,
                   "time_cost":None,
                   "source":links,
                    "file_chunk":"",
                   "file_all_output":""}
      if niche_cases:
        for niche in niche_cases:
          acc_score[niche] = {}
            
      meta = mtdna_classifier.fetch_ncbi_metadata(acc)
      country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
      acc_score["isolate"] = iso
      print("meta: ",meta)  
      meta_expand = smart_fallback.fetch_ncbi(acc)
      print("meta expand: ", meta_expand)  
      # set up step: create the folder to save document
      chunk, all_output = "",""
      if pudID: 
        id = str(pudID)
        saveTitle = title
      else: 
        try:
          author_name = meta_expand["authors"].split(',')[0]  # Use last name only
        except:
          author_name = meta_expand["authors"] 
        saveTitle = title + "_" + col_date + "_" + author_name
        if title.lower() == "unknown" and col_date.lower()=="unknown" and   author_name.lower() == "unknown":
            saveTitle += "_" + acc
        id = "DirectSubmission"
      # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
      # if not folder_path.exists():
      #     cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
      #     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
      #     print("data/"+str(id) +" created.")
      # else:
      #     print("data/"+str(id) +" already exists.")
      # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
      # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
      # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
      # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
      data_folder_id = GDRIVE_DATA_FOLDER_NAME  # Use the shared folder directly
      sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
      print("sample folder id: ", sample_folder_id)
      
      # Define document names
      if len(saveTitle) > 50:  
          saveName = saveTitle[:50]
          saveName = saveName.replace(" ", "_")
          chunk_filename = f"{saveName}_merged_document.docx"
          all_filename = f"{saveName}_all_merged_document.docx"
      else:
          saveName = saveTitle.replace(" ", "_")
          chunk_filename = f"{saveName}_merged_document.docx"
          all_filename = f"{saveName}_all_merged_document.docx"
      print("chunk file name and all filename: ", chunk_filename, all_filename)  
      # Define local temp paths for reading/writing
      # import tempfile
      # tmp_dir = tempfile.mkdtemp()
      LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
      os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
      file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
      file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
      # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
      # file_all_path = os.path.join(tempfile.gettempdir(), all_filename)  
      if stop_flag is not None and stop_flag.value:
        print(f"🛑 Stop processing {accession}, aborting early...")
        return {}
      print("this is file chunk path: ", file_chunk_path)
      chunk_id = find_drive_file(chunk_filename, sample_folder_id)
      all_id = find_drive_file(all_filename, sample_folder_id)
    
      if chunk_id and all_id:
        print("✅ Files already exist in Google Drive. Downloading them...")
        chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
        all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
        acc_score["file_chunk"] = str(chunk_filename)
        acc_score["file_all_output"] = str(all_filename)  
        print("chunk_id and all_id: ")
        print(chunk_id, all_id)  
        print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"])  
        file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
        print("📄 Name:", file["name"])
        print("📁 Parent folder ID:", file["parents"][0])
        print("🔗 View link:", file["webViewLink"])
  
  
        # Read and parse these into `chunk` and `all_output`
      else:
        # 🔥 Remove any stale local copies
        if os.path.exists(file_chunk_path):
            os.remove(file_chunk_path)
            print(f"🗑️ Removed stale: {file_chunk_path}")
        if os.path.exists(file_all_path):
            os.remove(file_all_path)
            print(f"🗑️ Removed stale: {file_all_path}")  
      # 🔥 Remove the local file first if it exists
      # if os.path.exists(file_chunk_path):
      #   os.remove(file_chunk_path)
      #   print("remove chunk path")  
      # if os.path.exists(file_all_path):
      #   os.remove(file_all_path)    
      #   print("remove all path")  
      # Try to download if already exists on Drive
        chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
        all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
      print("chunk exist: ", chunk_exists)  
      # first way: ncbi method
      print("country.lower: ",country.lower())  
      if country.lower() != "unknown":
        stand_country = standardize_location.smart_country_lookup(country.lower())
        print("stand_country: ", stand_country)  
        if stand_country.lower() != "not found":
          acc_score["country"][stand_country.lower()] = ["ncbi"]
        else: acc_score["country"][country.lower()] = ["ncbi"]   
      # if spe_loc.lower() != "unknown":
      #   acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
      # if ethnic.lower() != "unknown":
      #   acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
      if sample_type.lower() != "unknown":
        acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
      # second way: LLM model
      # Preprocess the input token
      print(acc_score)  
      accession, isolate = None, None
      if acc != "unknown":  accession = acc
      if iso != "unknown":  isolate = iso
      if stop_flag is not None and stop_flag.value:
        print(f"🛑 Stop processing {accession}, aborting early...")
        return {}    
      # check doi first
      print("chunk filename: ", chunk_filename)    
      if chunk_exists:
        print("File chunk exists!")
        if not chunk:
            print("start to get chunk")
            text, table, document_title = model.read_docx_text(file_chunk_path)
            chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
        if str(chunk_filename) != "":
            print("first time have chunk path at chunk exist: ", str(chunk_filename))
            acc_score["file_chunk"] = str(chunk_filename)    
      if all_exists:
        print("File all output exists!")
        if not all_output:
            text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
            all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
        if str(all_filename) != "":
            print("first time have all path at all exist: ", str(all_filename))
            acc_score["file_all_output"] = str(all_filename)    
      print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"])  
      if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:  
          if doi != "unknown":
            link = 'https://doi.org/' + doi
            # get the file to create listOfFile for each id
            print("link of doi: ", link)  
            html = extractHTML.HTML("",link)
            jsonSM = html.getSupMaterial()
            article_text = html.getListSection()
            if article_text:
              if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
                links.append(link)
            if jsonSM:
              links += sum((jsonSM[key] for key in jsonSM),[])
          # no doi then google custom search api
          if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
            # might find the article
            print("no article text, start tem link")  
            #tem_links = mtdna_classifier.search_google_custom(title, 2)
            tem_links = smart_fallback.smart_google_search(meta_expand)
            print("tem links: ", tem_links)  
            tem_link_acc = smart_fallback.google_accession_search(acc)
            tem_links += tem_link_acc 
            tem_links = unique_preserve_order(tem_links)
            print("tem link before filtering: ", tem_links)  
            # filter the quality link  
            print("saveLinkFolder as sample folder id: ", sample_folder_id)  
            print("start the smart filter link") 
            if stop_flag is not None and stop_flag.value:
                print(f"🛑 Stop processing {accession}, aborting early...")
                return {}  
            # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
            # if success_process:
            #   links = output_process
            #   print("yes succeed for smart filter link")
            # else: 
            #   print("no suceed, fallback to all tem links")
            #   links = tem_links
            links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
          print("this is links: ",links)    
          links = unique_preserve_order(links)
          acc_score["source"] = links
      else:
        print("inside the try of reusing chunk or all output")  
        #print("chunk filename: ", str(chunks_filename))  
          
        try:
            temp_source = False
            if save_df is not None and not save_df.empty:
                print("save df not none")  
                print("chunk file name: ",str(chunk_filename))
                print("all filename: ",str(all_filename))
                if acc_score["file_chunk"]:
                  link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0]
                  #link = row["Sources"].iloc[0]
                  if "http" in link:   
                    print("yeah http in save df source") 
                    acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
                  else:  # temporary  
                    print("tempo source") 
                    #acc_score["source"] = [str(all_filename), str(chunks_filename)]
                    temp_source = True
                elif acc_score["file_all_output"]:
                  link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0]
                  #link = row["Sources"].iloc[0]
                  print(link)
                  print("list of link")
                  print([x for x in link.split("\n") if x.strip()])
                  if "http" in link:    
                    print("yeah http in save df source")
                    acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()   
                  else:  # temporary  
                    print("tempo source") 
                    #acc_score["source"] = [str(all_filename), str(chunks_filename)]
                    temp_source = True      
                else:  # temporary  
                  print("tempo source") 
                  #acc_score["source"] = [str(file_all_path), str(file_chunk_path)]  
                  temp_source = True
            else:  # temporary  
                print("tempo source") 
                  #acc_score["source"] = [str(file_all_path), str(file_chunk_path)]  
                temp_source = True      
            if temp_source:
                print("temp source is true so have to try again search link")
                if doi != "unknown":
                    link = 'https://doi.org/' + doi
                    # get the file to create listOfFile for each id
                    print("link of doi: ", link)  
                    html = extractHTML.HTML("",link)
                    jsonSM = html.getSupMaterial()
                    article_text = html.getListSection()
                    if article_text:
                      if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
                        links.append(link)
                    if jsonSM:
                      links += sum((jsonSM[key] for key in jsonSM),[])
                  # no doi then google custom search api
                if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
                    # might find the article
                    print("no article text, start tem link")  
                    #tem_links = mtdna_classifier.search_google_custom(title, 2)
                    tem_links = smart_fallback.smart_google_search(meta_expand)
                    print("tem links: ", tem_links)  
                    tem_link_acc = smart_fallback.google_accession_search(acc)
                    tem_links += tem_link_acc 
                    tem_links = unique_preserve_order(tem_links)
                    print("tem link before filtering: ", tem_links)  
                    # filter the quality link  
                    print("saveLinkFolder as sample folder id: ", sample_folder_id)  
                    print("start the smart filter link") 
                    if stop_flag is not None and stop_flag.value:
                        print(f"🛑 Stop processing {accession}, aborting early...")
                        return {}  
                    # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
                    # if success_process:
                    #   links = output_process
                    #   print("yes succeed for smart filter link")
                    # else: 
                    #   print("no suceed, fallback to all tem links")
                    #   links = tem_links
                    links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
                print("this is links: ",links)    
                links = unique_preserve_order(links)
                acc_score["source"] = links
        except:
            print("except for source")  
            acc_score["source"] = []
      # chunk_path = "/"+saveTitle+"_merged_document.docx"
      # all_path = "/"+saveTitle+"_all_merged_document.docx"
      # # if chunk and all output not exist yet
      # file_chunk_path = saveLinkFolder + chunk_path
      # file_all_path = saveLinkFolder + all_path
      # if os.path.exists(file_chunk_path):
      #   print("File chunk exists!")
      #   if not chunk:
      #     text, table, document_title = model.read_docx_text(file_chunk_path)
      #     chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
      # if os.path.exists(file_all_path):
      #   print("File all output exists!")
      #   if not all_output:
      #     text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
      #     all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
      if stop_flag is not None and stop_flag.value:
        print(f"🛑 Stop processing {accession}, aborting early...")
        return {}
      # print("chunk filename: ", chunk_filename)    
      # if chunk_exists:
      #   print("File chunk exists!")
      #   if not chunk:
      #       print("start to get chunk")
      #       text, table, document_title = model.read_docx_text(file_chunk_path)
      #       chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
      #   if str(chunk_filename) != "":
      #       print("first time have chunk path at chunk exist: ", str(chunk_filename))
      #       acc_score["file_chunk"] = str(chunk_filename)    
      # if all_exists:
      #   print("File all output exists!")
      #   if not all_output:
      #       text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
      #       all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
      #   if str(all_filename) != "":
      #       print("first time have all path at all exist: ", str(all_filename))
      #       acc_score["file_all_output"] = str(all_filename)    
      if not chunk and not all_output:
        print("not chunk and all output")  
        # else: check if we can reuse these chunk and all output of existed accession to find another
        if str(chunk_filename) != "":
          print("first time have chunk path: ", str(chunk_filename))
          acc_score["file_chunk"] = str(chunk_filename)
        if str(all_filename) != "":
          print("first time have all path: ", str(all_filename))
          acc_score["file_all_output"] = str(all_filename)    
        if links:
          for link in links:
              print(link)
              # if len(all_output) > 1000*1000:
              #   all_output = data_preprocess.normalize_for_overlap(all_output)
              #   print("after normalizing all output: ", len(all_output))
              if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
                print("break here")
                break
              if iso != "unknown": query_kw = iso
              else: query_kw = acc
              #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
              success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100)
              if stop_flag is not None and stop_flag.value:
                print(f"🛑 Stop processing {accession}, aborting early...")
                return {}
              if success_process:
                text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
                print("yes succeed for process document")
              else: text_link, tables_link, final_input_link = "", "", ""  
              context = data_preprocess.extract_context(final_input_link, query_kw)
              if context !=  "Sample ID not found.":
                if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
                  success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
                  if stop_flag is not None and stop_flag.value:
                    print(f"🛑 Stop processing {accession}, aborting early...")
                    return {}
                  if success_chunk:
                    chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
                    print("yes succeed for chunk")
                  else:
                    chunk += context
                    print("len context: ", len(context))
                    print("basic fall back")
                print("len chunk after: ", len(chunk))
              if len(final_input_link) > 1000*1000:
                if context !=  "Sample ID not found.":
                  final_input_link =  context 
                else:
                  final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
                  if len(final_input_link) > 1000 *1000:
                    final_input_link = final_input_link[:100000] 
              if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000:
                print("Running merge_texts_skipping_overlap with timeout")
                success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30)
                if stop_flag is not None and stop_flag.value:
                  print(f"🛑 Stop processing {accession}, aborting early...")
                  return {}
                print("Returned from timeout logic")
                if success:
                  all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
                  print("yes succeed")
                else:
                  print("len all output: ", len(all_output))
                  print("len final input link: ", len(final_input_link))
                  all_output += final_input_link
                  print("len final input: ", len(final_input_link))
                  print("basic fall back")
              else:
                  print("both/either all output or final link too large more than 100000")
                  print("len all output: ", len(all_output))
                  print("len final input link: ", len(final_input_link))
                  all_output += final_input_link
                  print("len final input: ", len(final_input_link))
                  print("basic fall back")    
              print("len all output after: ", len(all_output))
          #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
        if stop_flag is not None and stop_flag.value:
          print(f"🛑 Stop processing {accession}, aborting early...")
          return {}  
        else:
          chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
          all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
        if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
        if not all_output:  all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
        if len(all_output) > 1*1024*1024: 
          all_output = data_preprocess.normalize_for_overlap(all_output)
          if len(all_output) > 1*1024*1024:
            all_output = all_output[:1*1024*1024]
        print("chunk len: ", len(chunk))
        print("all output len: ", len(all_output))    
        data_preprocess.save_text_to_docx(chunk, file_chunk_path)
        data_preprocess.save_text_to_docx(all_output, file_all_path)
        # Later when saving new files
        # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
        # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
        
        # Upload to Drive
        result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
        result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
        print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
        print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")  
        print("here 1")
          
      # else:
      #   final_input = ""
      #   if all_output:
      #     final_input = all_output
      #   else:  
      #     if chunk: final_input = chunk
      #   #data_preprocess.merge_texts_skipping_overlap(final_input, all_output)
      #   if final_input:
      #     keywords = []
      #     if iso != "unknown":  keywords.append(iso)
      #     if acc != "unknown":  keywords.append(acc)
      #     for keyword in keywords:
      #       chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword)
      #       countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword)
      #       chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS)
      #       chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
          
      # Define paths for cached RAG assets
      # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
      # document_chunks_path = saveLinkFolder+"/document_chunks.json"
      # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
      print("here 2")
      faiss_filename = "faiss_index.bin"
      chunks_filename = "document_chunks.json"
      lookup_filename = "structured_lookup.json"
      print("name of faiss: ", faiss_filename)  
          
      faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename)
      document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename)
      structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename)
      print("name if faiss path: ", faiss_index_path)  
      # 🔥 Remove the local file first if it exists
      print("start faiss id and also the sample folder id is: ", sample_folder_id)  
      faiss_id = find_drive_file(faiss_filename, sample_folder_id)
      print("done faiss id")  
      document_id = find_drive_file(chunks_filename, sample_folder_id)
      structure_id = find_drive_file(lookup_filename, sample_folder_id)  
      if faiss_id and document_id and structure_id:
        print("✅ 3 Files already exist in Google Drive. Downloading them...")
        download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
        download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
        download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)  
        # Read and parse these into `chunk` and `all_output`
      else:
        "one of id not exist"  
        if os.path.exists(faiss_index_path):
            print("faiss index exist and start to remove: ", faiss_index_path)
            os.remove(faiss_index_path)
        if os.path.exists(document_chunks_path):
            os.remove(document_chunks_path)
        if os.path.exists(structured_lookup_path):
            os.remove(structured_lookup_path)    
        print("start to download the faiss, chunk, lookup")
            
        download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
        download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
        download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
      try:
          print("try gemini 2.5")
          print("move to load rag")
          master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
              faiss_index_path, document_chunks_path, structured_lookup_path
          )
    
          global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
          if not all_output:
            if chunk: all_output = chunk
            else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
          if faiss_index is None:
              print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
              total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
                  all_output
              ).total_tokens
    
              initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
              total_cost_title += initial_embedding_cost
              print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
    
    
              master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
                  file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
              )
          else:
              print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
              plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
              master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
          if stop_flag is not None and stop_flag.value:
            print(f"🛑 Stop processing {accession}, aborting early...")
            return {}  
          primary_word = iso
          alternative_word = acc
          print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
          if features.lower() not in all_output.lower():  
            all_output += ". NCBI Features: " + features
          # country, sample_type, method_used, ethnic, spe_loc, total_query_cost =  model.query_document_info(
          #     primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, 
          #     model.call_llm_api, chunk=chunk, all_output=all_output)
          print("this is chunk for the model")
          print(chunk)
          print("this is all output for the model")
          print(all_output)  
          if stop_flag is not None and stop_flag.value:
            print(f"🛑 Stop processing {accession}, aborting early...")
            return {}
          country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost =  model.query_document_info(
              primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, 
              model.call_llm_api, chunk=chunk, all_output=all_output)
          print("pass query of 2.5")
      except:
          print("try gemini 1.5")
          country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info(
            primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, 
            model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest")      
          print("yeah pass the query of 1.5")
      print("country using ai: ", country)
      print("sample type using ai: ", sample_type)  
      # if len(country) == 0: country = "unknown"
      # if len(sample_type) == 0: sample_type = "unknown"    
      # if country_explanation: country_explanation = "-"+country_explanation        
      # else: country_explanation = ""
      # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
      # else: sample_type_explanation = ""
      if len(country) == 0: country = "unknown"
      if len(sample_type) == 0: sample_type = "unknown"    
      if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation        
      else: country_explanation = ""
      if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation
      else: sample_type_explanation = ""
          
      if method_used == "unknown": method_used = ""
      if country.lower() != "unknown":
        stand_country = standardize_location.smart_country_lookup(country.lower())
        if stand_country.lower() != "not found":
          if stand_country.lower() in acc_score["country"]:
            if country_explanation:
              acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
          else:
            acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
        else:
          if country.lower() in acc_score["country"]:
            if country_explanation:
              if len(method_used + country_explanation) > 0:
                acc_score["country"][country.lower()].append(method_used + country_explanation)
          else:
            if len(method_used + country_explanation) > 0:
              acc_score["country"][country.lower()] = [method_used + country_explanation]
      # if spe_loc.lower() != "unknown":
      #   if spe_loc.lower() in acc_score["specific_location"]:
      #     acc_score["specific_location"][spe_loc.lower()].append(method_used)
      #   else:
      #     acc_score["specific_location"][spe_loc.lower()] = [method_used]
      # if ethnic.lower() != "unknown":
      #   if ethnic.lower() in acc_score["ethnicity"]:
      #     acc_score["ethnicity"][ethnic.lower()].append(method_used)
      #   else:
      #     acc_score["ethnicity"][ethnic.lower()] = [method_used]
      if sample_type.lower() != "unknown":
        if sample_type.lower() in acc_score["sample_type"]:
          if len(method_used + sample_type_explanation) > 0:
            acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
        else:
          if len(method_used + sample_type_explanation)> 0:
            acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
      total_cost_title += total_query_cost
      if stop_flag is not None and stop_flag.value:
        print(f"🛑 Stop processing {accession}, aborting early...")
        return {}
      # last resort: combine all information to give all output otherwise unknown
      if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown":   
        text = ""
        for key in meta_expand:
          text += str(key) + ": " + meta_expand[key] + "\n"    
        if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
          text += data_preprocess.normalize_for_overlap(all_output)          
        if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
          text += data_preprocess.normalize_for_overlap(chunk)            
        text += ". NCBI Features: " + features 
        print("this is text for the last resort model")
        print(text)  
        country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost =  model.query_document_info(
            primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks, 
            model.call_llm_api, chunk=text, all_output=text)  
        print("this is last resort results: ")
        print("country: ", country)
        print("sample type: ", sample_type)  
        if len(country) == 0: country = "unknown"
        if len(sample_type) == 0: sample_type = "unknown"    
        # if country_explanation: country_explanation = "-"+country_explanation        
        # else: country_explanation = ""
        # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
        # else: sample_type_explanation = ""
        if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation        
        else: country_explanation = ""
        if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation
        else: sample_type_explanation = ""
            
        if method_used == "unknown": method_used = ""
        if country.lower() != "unknown":
          stand_country = standardize_location.smart_country_lookup(country.lower())
          if stand_country.lower() != "not found":
            if stand_country.lower() in acc_score["country"]:
              if country_explanation:
                acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
            else:
              acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
          else:
            if country.lower() in acc_score["country"]:
              if country_explanation:
                if len(method_used + country_explanation) > 0:
                  acc_score["country"][country.lower()].append(method_used + country_explanation)
            else:
              if len(method_used + country_explanation) > 0:
                acc_score["country"][country.lower()] = [method_used + country_explanation]
        if sample_type.lower() != "unknown":
            if sample_type.lower() in acc_score["sample_type"]:
              if len(method_used + sample_type_explanation) > 0:
                acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
            else:
              if len(method_used + sample_type_explanation)> 0:
                acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]          
        total_cost_title += total_query_cost
      end = time.time()
      #total_cost_title += total_query_cost
      acc_score["query_cost"] = f"{total_cost_title:.6f}"
      elapsed = end - start
      acc_score["time_cost"] = f"{elapsed:.3f} seconds"
      accs_output[acc] = acc_score
      print(accs_output[acc])
      
    return accs_output