"""Embed question/answer metadata and load it into a Supabase vector store.

Reads ``metadata.jsonl``, embeds each entry with a HuggingFace sentence
transformer (all-mpnet-base-v2, 768-dim), bulk-inserts the rows into the
Supabase ``documents`` table, and runs a sample similarity query through
the ``match_documents`` RPC as a sanity check.
"""

import json
import os

from dotenv import load_dotenv
from langchain.schema import Document  # noqa: F401 -- kept; may be relied on by importers of this module
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import Client, create_client

# === Load environment variables ===
load_dotenv()
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")

# Fail fast with a clear message instead of an opaque client-library error
# when the credentials are missing from the environment.
if not SUPABASE_URL or not SUPABASE_KEY:
    raise RuntimeError(
        "SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in the environment"
    )

# === Initialize Supabase and embeddings ===
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)  # dim=768


# === Load metadata.jsonl ===
def load_jsonl(file_path: str) -> list[dict]:
    """Parse a JSON Lines file into a list of dicts (one dict per line)."""
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


# === Convert JSON entries to documents with embeddings ===
def build_documents(json_data: list[dict]) -> list[dict]:
    """Turn metadata entries into Supabase rows with precomputed embeddings.

    Each entry is expected to carry 'Question', 'Final answer', and
    'task_id' keys -- TODO confirm against the metadata.jsonl producer.
    """
    docs = []
    for item in json_data:
        content = f"Question: {item['Question']}\n\nFinal answer: {item['Final answer']}"
        docs.append({
            "content": content,
            # must contain 'source' key for SupabaseVectorStore
            "metadata": {"source": item['task_id']},
            "embedding": embeddings.embed_query(content),
        })
    return docs


# === Insert documents into Supabase ===
def insert_documents(docs: list[dict]) -> None:
    """Bulk-insert prepared rows into the Supabase 'documents' table.

    Deliberately non-fatal: failures are reported, not raised, so a
    partial pipeline run remains visible.
    """
    try:
        supabase.table("documents").insert(docs).execute()
        print("Inserted documents:", len(docs))
    except Exception as e:
        print("❌ Error inserting data into Supabase:", e)


# === (Optional) Save docs to CSV ===
def save_docs_to_csv(docs: list[dict], filename: str = "supabase_docs.csv") -> None:
    """Dump the prepared rows to a CSV file for manual upload."""
    import pandas as pd

    df = pd.DataFrame(docs)
    df.to_csv(filename, index=False)
    # BUG FIX: the f-string previously contained no placeholder, so the
    # actual output path was never shown.
    print(f"Saved documents to {filename}")


# === Test querying the vector store ===
def test_vector_query(query: str) -> None:
    """Run a similarity search against the vector store and print the top hit."""
    vector_store = SupabaseVectorStore(
        client=supabase,
        embedding=embeddings,
        table_name="documents",
        query_name="match_documents",  # make sure this function exists in Supabase
    )
    retriever = vector_store.as_retriever()
    results = retriever.invoke(query)
    # BUG FIX: guard the empty-result case (previously raised IndexError
    # on results[0] when the table had no matches).
    if not results:
        print("No matches found.")
        return
    print("Top match:")
    print(results[0].page_content)


# === Main execution ===
def main() -> None:
    """Load metadata, insert embedded rows, and run a sanity-check query."""
    json_data = load_jsonl("metadata.jsonl")
    documents = build_documents(json_data)
    insert_documents(documents)
    # Uncomment below if you want to manually upload the data
    # save_docs_to_csv(documents)
    query = (
        "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. "
        "This article mentions a team that produced a paper about their observations, linked at the bottom. "
        "Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?"
    )
    test_vector_query(query)


if __name__ == "__main__":
    main()