import gradio as gr
from collections import Counter
import csv
import os
from functools import lru_cache
#import app
from mtdna_classifier import classify_sample_location
import data_preprocess, model, pipeline
import subprocess
import json
import pandas as pd
import io
import re
import tempfile
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from io import StringIO
import hashlib
import threading

# @lru_cache(maxsize=3600)
# def classify_sample_location_cached(accession):
#     return classify_sample_location(accession)

#@lru_cache(maxsize=3600)
def pipeline_classify_sample_location_cached(accession, stop_flag=None, save_df=None):
    print("inside pipeline_classify_sample_location_cached, and [accession] is ", [accession])
    print("len of save df: ", len(save_df))
    return pipeline.pipeline_with_gemini([accession], stop_flag=stop_flag, save_df=save_df)
# Count and suggest final location
# def compute_final_suggested_location(rows):
#     candidates = [
#         row.get("Predicted Location", "").strip()
#         for row in rows
#         if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
#     ] + [
#         row.get("Inferred Region", "").strip()
#         for row in rows
#         if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
#     ]
#     if not candidates:
#         return Counter(), ("Unknown", 0)
#     # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
#     tokens = []
#     for item in candidates:
#         # Split by comma, whitespace, and newlines
#         parts = re.split(r'[\s,]+', item)
#         tokens.extend(parts)
#     # Step 2: Clean and normalize tokens
#     tokens = [word.strip() for word in tokens if word.strip().isalpha()]  # Keep only alphabetic tokens
#     # Step 3: Count
#     counts = Counter(tokens)
#     # Step 4: Get most common
#     top_location, count = counts.most_common(1)[0]
#     return counts, (top_location, count)
# Store feedback (with required fields)
def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
    if not answer1.strip() or not answer2.strip():
        return "⚠️ Please answer both questions before submitting."
    try:
        # ✅ Step: Load credentials from Hugging Face secret
        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
        scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        # Connect to Google Sheet
        client = gspread.authorize(creds)
        sheet = client.open("feedback_mtdna").sheet1  # make sure sheet name matches
        # Append feedback
        sheet.append_row([accession, answer1, answer2, contact])
        return "✅ Feedback submitted. Thank you!"
    except Exception as e:
        return f"❌ Error submitting feedback: {e}"
import re

ACCESSION_REGEX = re.compile(r'^[A-Z]{1,4}_?\d{6}(\.\d+)?$')

def is_valid_accession(acc):
    return bool(ACCESSION_REGEX.match(acc))
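# Illustrative examples (not used by the app; the IDs below are documentation only)
# of what ACCESSION_REGEX accepts, assuming GenBank/RefSeq-style accessions:
#   is_valid_accession("KU131308")     -> True   (2 letters + 6 digits)
#   is_valid_accession("NC_012920.1")  -> True   (optional underscore and version suffix)
#   is_valid_accession("ku131308")     -> False  (lowercase letters are not matched)
#   is_valid_accession("KU1313")       -> False  (fewer than 6 digits)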
# helper function to extract accessions
def extract_accessions_from_input(file=None, raw_text=""):
    print(f"RAW TEXT RECEIVED: {raw_text}")
    accessions, invalid_accessions = [], []
    seen = set()
    if file:
        try:
            if file.name.endswith(".csv"):
                df = pd.read_csv(file)
            elif file.name.endswith(".xlsx"):
                df = pd.read_excel(file)
            else:
                return [], [], "Unsupported file format. Please upload CSV or Excel."
            for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
                if acc not in seen:
                    if is_valid_accession(acc):
                        accessions.append(acc)
                        seen.add(acc)
                    else:
                        invalid_accessions.append(acc)
        except Exception as e:
            return [], [], f"Failed to read file: {e}"
    if raw_text:
        try:
            text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
            for acc in text_ids:
                if acc not in seen:
                    if is_valid_accession(acc):
                        accessions.append(acc)
                        seen.add(acc)
                    else:
                        invalid_accessions.append(acc)
        except Exception as e:
            return [], [], f"Failed to parse text input: {e}"
    return list(accessions), list(invalid_accessions), None
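# Illustrative call (the IDs are made up): valid and invalid accessions mixed in raw text.
#   extract_accessions_from_input(raw_text="KU131308, MF362736\nnot_an_id")
#   -> (["KU131308", "MF362736"], ["not_an_id"], None)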
# ✅ Add a new helper to backend: `filter_unprocessed_accessions()`
def get_incomplete_accessions(file_path):
    df = pd.read_excel(file_path)
    incomplete_accessions = []
    for _, row in df.iterrows():
        sample_id = str(row.get("Sample ID", "")).strip()
        # Skip if no sample ID
        if not sample_id:
            continue
        # Drop the Sample ID and check if the rest is empty
        other_cols = row.drop(labels=["Sample ID"], errors="ignore")
        if other_cols.isna().all() or (other_cols.astype(str).str.strip() == "").all():
            # Extract the accession number from the sample ID using regex
            match = re.search(r"\b[A-Z]{2,4}\d{4,}", sample_id)
            if match:
                incomplete_accessions.append(match.group(0))
    print(len(incomplete_accessions))
    return incomplete_accessions
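# Illustrative behaviour (the file name and row contents are hypothetical): given a
# partially filled "batch_output_live.xlsx" in which the row labelled
# "KU131308(Isolate: BRU18)" has a Sample ID but every other column empty, the
# function returns ["KU131308"] so that only unfinished accessions are re-run.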
# GOOGLE_SHEET_NAME = "known_samples"
# USAGE_DRIVE_FILENAME = "user_usage_log.json"

def summarize_results(accession, stop_flag=None):
    # Early bail
    if stop_flag is not None and stop_flag.value:
        print(f"🛑 Skipping {accession} before starting.")
        return []
    # try cache first
    cached = check_known_output(accession)
    if cached:
        print(f"✅ Using cached result for {accession}")
        return [[
            cached["Sample ID"] or "unknown",
            cached["Predicted Country"] or "unknown",
            cached["Country Explanation"] or "unknown",
            cached["Predicted Sample Type"] or "unknown",
            cached["Sample Type Explanation"] or "unknown",
            cached["Sources"] or "No Links",
            cached["Time cost"]
        ]]
    # only run when nothing is in the cache
    try:
        print("try gemini pipeline: ", accession)
        # ✅ Load credentials from Hugging Face secret
        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        client = gspread.authorize(creds)
        spreadsheet = client.open("known_samples")
        sheet = spreadsheet.sheet1
        data = sheet.get_all_values()
        if not data:
            print("⚠️ Google Sheet 'known_samples' is empty.")
            return []  # keep the return type consistent so callers can extend() the result
        save_df = pd.DataFrame(data[1:], columns=data[0])
        print("before pipeline, len of save df: ", len(save_df))
        outputs = pipeline_classify_sample_location_cached(accession, stop_flag, save_df)
        if stop_flag is not None and stop_flag.value:
            print(f"🛑 Skipped {accession} mid-pipeline.")
            return []
        # outputs = {'KU131308': {'isolate': 'BRU18',
        #     'country': {'brunei': ['ncbi',
        #         'rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
        #     'sample_type': {'modern':
        #         ['rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
        #     'query_cost': 9.754999999999999e-05,
        #     'time_cost': '24.776 seconds',
        #     'source': ['https://doi.org/10.1007/s00439-015-1620-z',
        #         'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf',
        #         'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls']}}
    except Exception as e:
        return []  # , f"Error: {e}", f"Error: {e}", f"Error: {e}"
    if accession not in outputs:
        print("no accession in output ", accession)
        return []  # , "Accession not found in results.", "Accession not found in results.", "Accession not found in results."
    row_score = []
    rows = []
    save_rows = []
    for key in outputs:
        pred_country, pred_sample, country_explanation, sample_explanation = "unknown", "unknown", "unknown", "unknown"
        for section, results in outputs[key].items():
            if section == "country" or section == "sample_type":
                pred_output = []  # "\n".join(list(results.keys()))
                output_explanation = ""
                for result, content in results.items():
                    if len(result) == 0: result = "unknown"
                    if len(content) == 0: output_explanation = "unknown"
                    else:
                        output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
                    pred_output.append(result)
                pred_output = "\n".join(pred_output)
                if section == "country":
                    pred_country, country_explanation = pred_output, output_explanation
                elif section == "sample_type":
                    pred_sample, sample_explanation = pred_output, output_explanation
        if outputs[key]["isolate"].lower() != "unknown":
            label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
        else: label = key
        if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
        row = {
            "Sample ID": label or "unknown",
            "Predicted Country": pred_country or "unknown",
            "Country Explanation": country_explanation or "unknown",
            "Predicted Sample Type": pred_sample or "unknown",
            "Sample Type Explanation": sample_explanation or "unknown",
            "Sources": "\n".join(outputs[key]["source"]) or "No Links",
            "Time cost": outputs[key]["time_cost"]
        }
        # row_score.append(row)
        rows.append(list(row.values()))
        save_row = {
            "Sample ID": label or "unknown",
            "Predicted Country": pred_country or "unknown",
            "Country Explanation": country_explanation or "unknown",
            "Predicted Sample Type": pred_sample or "unknown",
            "Sample Type Explanation": sample_explanation or "unknown",
            "Sources": "\n".join(outputs[key]["source"]) or "No Links",
            "Query_cost": outputs[key]["query_cost"] or "",
            "Time cost": outputs[key]["time_cost"] or "",
            "file_chunk": outputs[key]["file_chunk"] or "",
            "file_all_output": outputs[key]["file_all_output"] or ""
        }
        # row_score.append(row)
        save_rows.append(list(save_row.values()))
    # location_counts, (final_location, count) = compute_final_suggested_location(row_score)
    # summary_lines = [f"### 🧭 Location Summary:\n"]
    # summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
    # summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
    # summary = "\n".join(summary_lines)
    # save the new running sample to the known output store
    # try:
    #     df_new = pd.DataFrame(save_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Query_cost", "Time cost"])
    #     if os.path.exists(KNOWN_OUTPUT_PATH):
    #         df_old = pd.read_excel(KNOWN_OUTPUT_PATH)
    #         df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
    #     else:
    #         df_combined = df_new
    #     df_combined.to_excel(KNOWN_OUTPUT_PATH, index=False)
    # except Exception as e:
    #     print(f"⚠️ Failed to save known output: {e}")
    # try:
    #     df_new = pd.DataFrame(save_rows, columns=[
    #         "Sample ID", "Predicted Country", "Country Explanation",
    #         "Predicted Sample Type", "Sample Type Explanation",
    #         "Sources", "Query_cost", "Time cost"
    #     ])
    #     # ✅ Google Sheets API setup
    #     creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
    #     scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
    #     creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
    #     client = gspread.authorize(creds)
    #     # ✅ Open the known_samples sheet
    #     spreadsheet = client.open("known_samples")  # Replace with your sheet name
    #     sheet = spreadsheet.sheet1
    #     # ✅ Read old data
    #     existing_data = sheet.get_all_values()
    #     if existing_data:
    #         df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
    #     else:
    #         df_old = pd.DataFrame(columns=df_new.columns)
    #     # ✅ Combine and remove duplicates
    #     df_combined = pd.concat([df_old, df_new], ignore_index=True).drop_duplicates(subset="Sample ID")
    #     # ✅ Clear and write back
    #     sheet.clear()
    #     sheet.update([df_combined.columns.values.tolist()] + df_combined.values.tolist())
    # except Exception as e:
    #     print(f"⚠️ Failed to save known output to Google Sheets: {e}")
    try:
        # Prepare as DataFrame
        df_new = pd.DataFrame(save_rows, columns=[
            "Sample ID", "Predicted Country", "Country Explanation",
            "Predicted Sample Type", "Sample Type Explanation",
            "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
        ])
        # ✅ Setup Google Sheets
        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        client = gspread.authorize(creds)
        spreadsheet = client.open("known_samples")
        sheet = spreadsheet.sheet1
        # ✅ Read existing data
        existing_data = sheet.get_all_values()
        if existing_data:
            df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
        else:
            df_old = pd.DataFrame(columns=[
                "Sample ID", "Actual_country", "Actual_sample_type", "Country Explanation",
                "Match_country", "Match_sample_type", "Predicted Country", "Predicted Sample Type",
                "Query_cost", "Sample Type Explanation", "Sources", "Time cost", "file_chunk", "file_all_output"
            ])
        # ✅ Index by Sample ID
        df_old.set_index("Sample ID", inplace=True)
        df_new.set_index("Sample ID", inplace=True)
        # ✅ Update only matching fields
        update_columns = [
            "Predicted Country", "Predicted Sample Type", "Country Explanation",
            "Sample Type Explanation", "Sources", "Query_cost", "Time cost", "file_chunk", "file_all_output"
        ]
        for idx, row in df_new.iterrows():
            if idx not in df_old.index:
                df_old.loc[idx] = ""  # new row, fill empty first
            for col in update_columns:
                if pd.notna(row[col]) and row[col] != "":
                    df_old.at[idx, col] = row[col]
        # ✅ Reset and write back
        df_old.reset_index(inplace=True)
        sheet.clear()
        sheet.update([df_old.columns.values.tolist()] + df_old.values.tolist())
        print("✅ Match results saved to known_samples.")
    except Exception as e:
        print(f"❌ Failed to update known_samples: {e}")
    return rows  # , summary, labelAncient_Modern, explain_label
# save the batch output in an Excel file
# def save_to_excel(all_rows, summary_text, flag_text, filename):
#     with pd.ExcelWriter(filename) as writer:
#         # Save table
#         df_new = pd.DataFrame(all_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
#         df.to_excel(writer, sheet_name="Detailed Results", index=False)
#     try:
#         df_old = pd.read_excel(filename)
#     except:
#         df_old = pd.DataFrame([[]], columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
#     df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
#     # if os.path.exists(filename):
#     #     df_old = pd.read_excel(filename)
#     #     df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
#     # else:
#     #     df_combined = df_new
#     df_combined.to_excel(filename, index=False)
#     # # Save summary
#     # summary_df = pd.DataFrame({"Summary": [summary_text]})
#     # summary_df.to_excel(writer, sheet_name="Summary", index=False)
#     # # Save flag
#     # flag_df = pd.DataFrame({"Flag": [flag_text]})
#     # flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
# def save_to_excel(all_rows, summary_text, flag_text, filename):
#     df_new = pd.DataFrame(all_rows, columns=[
#         "Sample ID", "Predicted Country", "Country Explanation",
#         "Predicted Sample Type", "Sample Type Explanation",
#         "Sources", "Time cost"
#     ])
#     try:
#         if os.path.exists(filename):
#             df_old = pd.read_excel(filename)
#         else:
#             df_old = pd.DataFrame(columns=df_new.columns)
#     except Exception as e:
#         print(f"⚠️ Warning reading old Excel file: {e}")
#         df_old = pd.DataFrame(columns=df_new.columns)
#     # df_combined = pd.concat([df_new, df_old], ignore_index=True).drop_duplicates(subset="Sample ID", keep="first")
#     df_old.set_index("Sample ID", inplace=True)
#     df_new.set_index("Sample ID", inplace=True)
#     df_old.update(df_new)  # <-- update matching rows in df_old with df_new content
#     df_combined = df_old.reset_index()
#     try:
#         df_combined.to_excel(filename, index=False)
#     except Exception as e:
#         print(f"❌ Failed to write Excel file {filename}: {e}")
def save_to_excel(all_rows, summary_text, flag_text, filename, is_resume=False):
    df_new = pd.DataFrame(all_rows, columns=[
        "Sample ID", "Predicted Country", "Country Explanation",
        "Predicted Sample Type", "Sample Type Explanation",
        "Sources", "Time cost"
    ])
    if is_resume and os.path.exists(filename):
        try:
            df_old = pd.read_excel(filename)
        except Exception as e:
            print(f"⚠️ Warning reading old Excel file: {e}")
            df_old = pd.DataFrame(columns=df_new.columns)
        # Set index and update existing rows
        df_old.set_index("Sample ID", inplace=True)
        df_new.set_index("Sample ID", inplace=True)
        df_old.update(df_new)
        df_combined = df_old.reset_index()
    else:
        # If not resuming or the file doesn't exist, just use the new rows
        df_combined = df_new
    try:
        df_combined.to_excel(filename, index=False)
    except Exception as e:
        print(f"❌ Failed to write Excel file {filename}: {e}")
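# Illustrative call (the file name is hypothetical): when resuming, rows already present
# in the workbook are updated in place by "Sample ID"; otherwise the file is overwritten.
#   save_to_excel(rows, "", "", "batch_output_live.xlsx", is_resume=True)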
# save the batch output in a JSON file
def save_to_json(all_rows, summary_text, flag_text, filename):
    output_dict = {
        "Detailed_Results": all_rows  # <-- make sure this is a plain list, not a DataFrame
        # "Summary_Text": summary_text,
        # "Ancient_Modern_Flag": flag_text
    }
    # If all_rows is a DataFrame, convert it
    if isinstance(all_rows, pd.DataFrame):
        output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")
    with open(filename, "w") as external_file:
        json.dump(output_dict, external_file, indent=2)
# save the batch output in a text file
def save_to_txt(all_rows, summary_text, flag_text, filename):
    if isinstance(all_rows, pd.DataFrame):
        detailed_results = all_rows.to_dict(orient="records")
    else:
        detailed_results = list(all_rows)  # assumes an iterable of dict-like rows
    output = ""
    # output += ",".join(list(detailed_results[0].keys())) + "\n\n"
    output += ",".join([str(k) for k in detailed_results[0].keys()]) + "\n\n"
    for r in detailed_results:
        output += ",".join([str(v) for v in r.values()]) + "\n\n"
    with open(filename, "w") as f:
        f.write("=== Detailed Results ===\n")
        f.write(output + "\n")
        # f.write("\n=== Summary ===\n")
        # f.write(summary_text + "\n")
        # f.write("\n=== Ancient/Modern Flag ===\n")
        # f.write(flag_text + "\n")
def save_batch_output(all_rows, output_type, summary_text=None, flag_text=None):
    tmp_dir = tempfile.mkdtemp()
    # html_table = all_rows.value  # assuming this is stored somewhere
    # Parse back to DataFrame
    # all_rows = pd.read_html(all_rows)[0]  # [0] because read_html returns a list
    all_rows = pd.read_html(StringIO(all_rows))[0]
    print(all_rows)
    if output_type == "Excel":
        file_path = f"{tmp_dir}/batch_output.xlsx"
        save_to_excel(all_rows, summary_text, flag_text, file_path)
    elif output_type == "JSON":
        file_path = f"{tmp_dir}/batch_output.json"
        save_to_json(all_rows, summary_text, flag_text, file_path)
        print("Done with JSON")
    elif output_type == "TXT":
        file_path = f"{tmp_dir}/batch_output.txt"
        save_to_txt(all_rows, summary_text, flag_text, file_path)
    else:
        return gr.update(visible=False)  # invalid option
    return gr.update(value=file_path, visible=True)
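# Hedged sketch (component names below are illustrative, not this app's actual layout)
# of how save_batch_output could be wired to a download button in a Gradio Blocks UI:
#   download_btn.click(
#       fn=save_batch_output,
#       inputs=[results_html, output_type_dropdown],  # HTML table string + "Excel"/"JSON"/"TXT"
#       outputs=[download_file],                      # gr.File made visible with the saved path
#   )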
# save cost by checking the known outputs
# def check_known_output(accession):
#     if not os.path.exists(KNOWN_OUTPUT_PATH):
#         return None
#     try:
#         df = pd.read_excel(KNOWN_OUTPUT_PATH)
#         match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
#         if match:
#             accession = match.group(0)
#         matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
#         if not matched.empty:
#             return matched.iloc[0].to_dict()  # Return the cached row
#     except Exception as e:
#         print(f"⚠️ Failed to load known samples: {e}")
#     return None
# def check_known_output(accession):
#     try:
#         # ✅ Load credentials from Hugging Face secret
#         creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
#         scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
#         creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
#         client = gspread.authorize(creds)
#         # ✅ Open the known_samples sheet
#         spreadsheet = client.open("known_samples")  # Replace with your sheet name
#         sheet = spreadsheet.sheet1
#         # ✅ Read all rows
#         data = sheet.get_all_values()
#         if not data:
#             return None
#         df = pd.DataFrame(data[1:], columns=data[0])  # Skip header row
#         # ✅ Normalize accession pattern
#         match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
#         if match:
#             accession = match.group(0)
#         matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
#         if not matched.empty:
#             return matched.iloc[0].to_dict()
#     except Exception as e:
#         print(f"⚠️ Failed to load known samples from Google Sheets: {e}")
#     return None
# def check_known_output(accession):
#     print("inside check known output function")
#     try:
#         # ✅ Load credentials from Hugging Face secret
#         creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
#         scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
#         creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
#         client = gspread.authorize(creds)
#         spreadsheet = client.open("known_samples")
#         sheet = spreadsheet.sheet1
#         data = sheet.get_all_values()
#         if not data:
#             print("⚠️ Google Sheet 'known_samples' is empty.")
#             return None
#         df = pd.DataFrame(data[1:], columns=data[0])
#         if "Sample ID" not in df.columns:
#             print("❌ Column 'Sample ID' not found in Google Sheet.")
#             return None
#         match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
#         if match:
#             accession = match.group(0)
#         matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
#         if not matched.empty:
#             # return matched.iloc[0].to_dict()
#             row = matched.iloc[0]
#             country = row.get("Predicted Country", "").strip().lower()
#             sample_type = row.get("Predicted Sample Type", "").strip().lower()
#             if country and country != "unknown" and sample_type and sample_type != "unknown":
#                 return row.to_dict()
#             else:
#                 print(f"⚠️ Accession {accession} found but country/sample_type is unknown or empty.")
#                 return None
#         else:
#             print(f"🔍 Accession {accession} not found in known_samples.")
#             return None
#     except Exception as e:
#         import traceback
#         print("❌ Exception occurred during check_known_output:")
#         traceback.print_exc()
#         return None
import os
import re
import json
import time
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials
from gspread.exceptions import APIError

# --- Global cache ---
_known_samples_cache = None

def load_known_samples():
    """Load the Google Sheet 'known_samples' into a Pandas DataFrame and cache it."""
    global _known_samples_cache
    try:
        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
        scope = [
            'https://spreadsheets.google.com/feeds',
            'https://www.googleapis.com/auth/drive'
        ]
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        client = gspread.authorize(creds)
        sheet = client.open("known_samples").sheet1
        data = sheet.get_all_values()
        if not data:
            print("⚠️ Google Sheet 'known_samples' is empty.")
            _known_samples_cache = pd.DataFrame()
        else:
            _known_samples_cache = pd.DataFrame(data[1:], columns=data[0])
            print(f"✅ Cached {_known_samples_cache.shape[0]} rows from known_samples")
    except APIError as e:
        print(f"❌ APIError while loading known_samples: {e}")
        _known_samples_cache = pd.DataFrame()
    except Exception as e:
        import traceback
        print("❌ Exception occurred while loading known_samples:")
        traceback.print_exc()
        _known_samples_cache = pd.DataFrame()
def check_known_output(accession):
    """Check if an accession exists in the cached 'known_samples' sheet."""
    global _known_samples_cache
    print("inside check known output function")
    try:
        # Load cache if not already loaded
        if _known_samples_cache is None:
            load_known_samples()
        if _known_samples_cache.empty:
            print("⚠️ No cached data available.")
            return None
        # Extract proper accession format (e.g. AB12345)
        match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
        if match:
            accession = match.group(0)
        matched = _known_samples_cache[
            _known_samples_cache["Sample ID"].str.contains(accession, case=False, na=False)
        ]
        if not matched.empty:
            row = matched.iloc[0]
            country = row.get("Predicted Country", "").strip().lower()
            sample_type = row.get("Predicted Sample Type", "").strip().lower()
            if country and country != "unknown" and sample_type and sample_type != "unknown":
                print(f"🎯 Found {accession} in cache")
                return row.to_dict()
            else:
                print(f"⚠️ Accession {accession} found but country/sample_type unknown or empty.")
                return None
        else:
            print(f"🔍 Accession {accession} not found in cache.")
            return None
    except Exception as e:
        import traceback
        print("❌ Exception occurred during check_known_output:")
        traceback.print_exc()
        return None
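# Illustrative usage (the accessions are hypothetical): the first lookup warms the
# module-level cache; later lookups reuse it without hitting the Sheets API again.
#   check_known_output("KU131308")   # loads 'known_samples' once, then searches it
#   check_known_output("KU131309")   # served from _known_samples_cache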
def hash_user_id(user_input):
    return hashlib.sha256(user_input.encode()).hexdigest()
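# Illustrative example (the email is hypothetical): user identifiers are stored only as
# SHA-256 hex digests, never as plain emails.
#   hash_user_id("user@example.com")  # -> 64-character hex string, stable across runs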
# ✅ Load and save usage count
# def load_user_usage():
#     if not os.path.exists(USER_USAGE_TRACK_FILE):
#         return {}
#     try:
#         with open(USER_USAGE_TRACK_FILE, "r") as f:
#             content = f.read().strip()
#             if not content:
#                 return {}  # file is empty
#             return json.loads(content)
#     except (json.JSONDecodeError, ValueError):
#         print("⚠️ Warning: user_usage.json is corrupted or invalid. Resetting.")
#         return {}  # fallback to empty dict
# def load_user_usage():
#     try:
#         creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
#         scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
#         creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
#         client = gspread.authorize(creds)
#         sheet = client.open("user_usage_log").sheet1
#         data = sheet.get_all_records()  # Assumes columns: email, usage_count
#         usage = {}
#         for row in data:
#             email = row.get("email", "").strip().lower()
#             count = int(row.get("usage_count", 0))
#             if email:
#                 usage[email] = count
#         return usage
#     except Exception as e:
#         print(f"⚠️ Failed to load user usage from Google Sheets: {e}")
#         return {}
# def load_user_usage():
#     try:
#         parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
#         iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
#         found = pipeline.find_drive_file("user_usage_log.json", parent_id=iterate3_id)
#         if not found:
#             return {}  # not found, start fresh
#         # file_id = found[0]["id"]
#         file_id = found
#         content = pipeline.download_drive_file_content(file_id)
#         return json.loads(content.strip()) if content.strip() else {}
#     except Exception as e:
#         print(f"⚠️ Failed to load user_usage_log.json from Google Drive: {e}")
#         return {}
def load_user_usage():
    try:
        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        client = gspread.authorize(creds)
        sheet = client.open("user_usage_log").sheet1
        data = sheet.get_all_values()
        print("data: ", data)
        if not data or len(data) < 2:
            print("⚠️ Sheet is empty or missing rows.")
            return {}, {}
        print("🧪 Raw header row from sheet:", data[0])
        print("🧪 Character codes in each header:")
        for h in data[0]:
            print([ord(c) for c in h])
        headers = [h.strip().lower() for h in data[0]]
        if "email" not in headers or "usage_count" not in headers:
            print("❌ Header format incorrect. Must have 'email' and 'usage_count'.")
            return {}, {}
        permitted_index = headers.index("permitted_samples") if "permitted_samples" in headers else None
        df = pd.DataFrame(data[1:], columns=headers)
        usage = {}
        permitted = {}
        for _, row in df.iterrows():
            email = row.get("email", "").strip().lower()
            try:
                # count = int(row.get("usage_count", 0))
                try:
                    count = int(float(row.get("usage_count", 0)))
                except Exception:
                    print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
                    count = 0
                if email:
                    usage[email] = count
                    if permitted_index is not None:
                        try:
                            permitted_count = int(float(row.get("permitted_samples", 50)))
                            permitted[email] = permitted_count
                        except:
                            permitted[email] = 50
            except ValueError:
                print(f"⚠️ Invalid usage_count for {email}: {row.get('usage_count')}")
        return usage, permitted
    except Exception as e:
        print(f"❌ Error in load_user_usage: {e}")
        return {}, {}
# def save_user_usage(usage):
#     with open(USER_USAGE_TRACK_FILE, "w") as f:
#         json.dump(usage, f, indent=2)
# def save_user_usage(usage_dict):
#     try:
#         creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
#         scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
#         creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
#         client = gspread.authorize(creds)
#         sheet = client.open("user_usage_log").sheet1
#         sheet.clear()  # clear old contents first
#         # Write header + rows
#         rows = [["email", "usage_count"]] + [[email, count] for email, count in usage_dict.items()]
#         sheet.update(rows)
#     except Exception as e:
#         print(f"❌ Failed to save user usage to Google Sheets: {e}")
# def save_user_usage(usage_dict):
#     try:
#         parent_id = pipeline.get_or_create_drive_folder("mtDNA-Location-Classifier")
#         iterate3_id = pipeline.get_or_create_drive_folder("iterate3", parent_id=parent_id)
#         import tempfile
#         tmp_path = os.path.join(tempfile.gettempdir(), "user_usage_log.json")
#         print("💾 Saving this usage dict:", usage_dict)
#         with open(tmp_path, "w") as f:
#             json.dump(usage_dict, f, indent=2)
#         pipeline.upload_file_to_drive(tmp_path, "user_usage_log.json", iterate3_id)
#     except Exception as e:
#         print(f"❌ Failed to save user_usage_log.json to Google Drive: {e}")
# def save_user_usage(usage_dict):
#     try:
#         creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
#         scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
#         creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
#         client = gspread.authorize(creds)
#         spreadsheet = client.open("user_usage_log")
#         sheet = spreadsheet.sheet1
#         # Step 1: Convert new usage to DataFrame
#         df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
#         df_new["email"] = df_new["email"].str.strip().str.lower()
#         # Step 2: Load existing data
#         existing_data = sheet.get_all_values()
#         print("🧪 Sheet existing_data:", existing_data)
#         # Try to load old data
#         if existing_data and len(existing_data[0]) >= 1:
#             df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
#             # Fix missing columns
#             if "email" not in df_old.columns:
#                 df_old["email"] = ""
#             if "usage_count" not in df_old.columns:
#                 df_old["usage_count"] = 0
#             df_old["email"] = df_old["email"].str.strip().str.lower()
#             df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
#         else:
#             df_old = pd.DataFrame(columns=["email", "usage_count"])
#         # Step 3: Merge
#         df_combined = pd.concat([df_old, df_new], ignore_index=True)
#         df_combined = df_combined.groupby("email", as_index=False).sum()
#         # Step 4: Write back
#         sheet.clear()
#         sheet.update([df_combined.columns.tolist()] + df_combined.astype(str).values.tolist())
#         print("✅ Saved user usage to user_usage_log sheet.")
#     except Exception as e:
#         print(f"❌ Failed to save user usage to Google Sheets: {e}")
def save_user_usage(usage_dict):
    try:
        creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        client = gspread.authorize(creds)
        spreadsheet = client.open("user_usage_log")
        sheet = spreadsheet.sheet1
        # Build new df
        df_new = pd.DataFrame(list(usage_dict.items()), columns=["email", "usage_count"])
        df_new["email"] = df_new["email"].str.strip().str.lower()
        df_new["usage_count"] = pd.to_numeric(df_new["usage_count"], errors="coerce").fillna(0).astype(int)
        # Read existing data
        existing_data = sheet.get_all_values()
        if existing_data and len(existing_data[0]) >= 2:
            df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
            df_old["email"] = df_old["email"].str.strip().str.lower()
            df_old["usage_count"] = pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
        else:
            df_old = pd.DataFrame(columns=["email", "usage_count"])
        # ✅ Overwrite specific emails only
        df_old = df_old.set_index("email")
        for email, count in usage_dict.items():
            email = email.strip().lower()
            df_old.loc[email, "usage_count"] = count
        df_old = df_old.reset_index()
        # Save
        sheet.clear()
        sheet.update([df_old.columns.tolist()] + df_old.astype(str).values.tolist())
        print("✅ Saved user usage to user_usage_log sheet.")
    except Exception as e:
        print(f"❌ Failed to save user usage to Google Sheets: {e}")
# def increment_usage(user_id, num_samples=1):
#     usage = load_user_usage()
#     if user_id not in usage:
#         usage[user_id] = 0
#     usage[user_id] += num_samples
#     save_user_usage(usage)
#     return usage[user_id]
# def increment_usage(email: str, count: int):
#     usage = load_user_usage()
#     email_key = email.strip().lower()
#     usage[email_key] = usage.get(email_key, 0) + count
#     save_user_usage(usage)
#     return usage[email_key]
def increment_usage(email: str, count: int = 1):
    usage, permitted = load_user_usage()
    email_key = email.strip().lower()
    # usage[email_key] = usage.get(email_key, 0) + count
    current = usage.get(email_key, 0)
    new_value = current + count
    max_allowed = permitted.get(email_key) or 50
    usage[email_key] = max(current, new_value)  # ✅ Prevent overwrite with a lower value
    print(f"🧪 increment_usage saving: {email_key=} {current=} + {count=} => {usage[email_key]=}")
    print("max allow is: ", max_allowed)
    save_user_usage(usage)
    return usage[email_key], max_allowed
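# Illustrative usage (the email is hypothetical): the caller receives both the updated
# total and the per-user cap from the 'permitted_samples' column (defaulting to 50).
#   used, max_allowed = increment_usage("user@example.com", 3)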
# run the batch
def summarize_batch(file=None, raw_text="", resume_file=None, user_email="",
                    stop_flag=None, output_file_path=None,
                    limited_acc=50, yield_callback=None):
    if user_email:
        limited_acc += 10
    accessions, invalid_accessions, error = extract_accessions_from_input(file, raw_text)
    if error:
        # return [], "", "", f"Error: {error}"
        return [], f"Error: {error}", 0, "", ""
    if resume_file:
        accessions = get_incomplete_accessions(resume_file)
    tmp_dir = tempfile.mkdtemp()
    if not output_file_path:
        if resume_file:
            output_file_path = os.path.join(tmp_dir, resume_file)
        else:
            output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
    all_rows = []
    # all_summaries = []
    # all_flags = []
    progress_lines = []
    warning = ""
    if len(accessions) > limited_acc:
        accessions = accessions[:limited_acc]
        warning = f"You submitted more than {limited_acc} accessions; only the first {limited_acc} will be processed."
    for i, acc in enumerate(accessions):
        if stop_flag and stop_flag.value:
            line = f"🛑 Stopped at {acc} ({i+1}/{len(accessions)})"
            progress_lines.append(line)
            if yield_callback:
                yield_callback(line)
            print("🛑 User requested stop.")
            break
        print(f"[{i+1}/{len(accessions)}] Processing {acc}")
        try:
            # rows, summary, label, explain = summarize_results(acc)
            rows = summarize_results(acc)
            all_rows.extend(rows)
            # all_summaries.append(f"**{acc}**\n{summary}")
            # all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
            # save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path)
            save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path, is_resume=bool(resume_file))
            line = f"✅ Processed {acc} ({i+1}/{len(accessions)})"
            progress_lines.append(line)
            if yield_callback:
                yield_callback(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
        except Exception as e:
            print(f"❌ Failed to process {acc}: {e}")
            continue
            # all_summaries.append(f"**{acc}**: Failed - {e}")
            # progress_lines.append(f"❌ Processed {acc} ({i+1}/{len(accessions)})")
        limited_acc -= 1
    """for row in all_rows:
        source_column = row[2]  # Assuming the "Source" is in the 3rd column (index 2)
        if source_column.startswith("http"):  # Check if the source is a URL
            # Wrap it with HTML anchor tags to make it clickable
            row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""
    if not warning:
        warning = f"You only have {limited_acc} accessions left."
    if user_email.strip():
        user_hash = hash_user_id(user_email)
        total_queries, _ = increment_usage(user_hash, len(all_rows))
    else:
        total_queries = 0
    if yield_callback:
        yield_callback("✅ Finished!")
    # summary_text = "\n\n---\n\n".join(all_summaries)
    # flag_text = "\n\n---\n\n".join(all_flags)
    # return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
    # return all_rows, gr.update(visible=True), gr.update(visible=False)
    return all_rows, output_file_path, total_queries, "\n".join(progress_lines), warning