Spaces:
Running
Running
# utils.py - shared helpers: text cleaning, OpenAI embeddings, and OpenAI prompt-context formatting.
import re | |
import os | |
import time | |
import traceback | |
import openai | |
from typing import Optional, List, Dict | |
try: | |
import config | |
except ImportError: | |
print("Error: config.py not found. Cannot proceed.") | |
raise SystemExit("config.py not found") | |
# --- OpenAI client initialization, clean_source_text, and get_embedding ---
# Module-level OpenAI client used by get_embedding(); it stays None when the
# API key is missing or when constructing the client raises.
openai_client = None
if not config.OPENAI_API_KEY:
    print("Utils: Warning - OPENAI_API_KEY not found. Embeddings will fail.")
else:
    try:
        openai_client = openai.OpenAI(api_key=config.OPENAI_API_KEY)
        print("Utils: OpenAI client initialized for embeddings.")
    except Exception as init_error:
        # Best-effort: report the problem and leave the client unset.
        print(f"Utils: Error initializing OpenAI client for embeddings: {init_error}")
def clean_source_text(text: Optional[str]) -> str:
    """Return *text* with junk characters removed and whitespace normalised.

    NUL (``\\x00``) and the Unicode replacement character (``\\ufffd``) are
    deleted, every run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is stripped. Falsy input (``None`` or an
    empty string) yields ``""``.
    """
    if text:
        # One pass that drops both unwanted code points.
        without_junk = text.translate({0x00: None, 0xFFFD: None})
        collapsed = re.sub(r'\s+', ' ', without_junk)
        return collapsed.strip()
    return ""
def get_embedding(text: str, model: str = config.EMBEDDING_MODEL, max_retries: int = 3) -> Optional[List[float]]:
    """Return the embedding vector for *text*, or ``None`` on any failure.

    Args:
        text: Text to embed. Newlines are replaced with spaces and the result
            is stripped before it is sent.
        model: Embedding model name; the default is read from
            ``config.EMBEDDING_MODEL`` when this function is defined.
        max_retries: Maximum number of API calls to attempt.

    Returns:
        The embedding from the first item of the API response, or ``None``
        when the module-level client is unset, the input is not a non-empty
        string, the text is empty after cleaning, or every attempt failed.

    Failures are reported with ``print`` and never raised to the caller.
    """
    # The module-level client is only read here, so no ``global`` is needed.
    if not openai_client:
        print("Error: OpenAI client not initialized (utils.py). Cannot get embedding.")
        return None
    if not text or not isinstance(text, str):
        print("Error: Invalid input text for embedding.")
        return None

    cleaned_text = text.replace("\n", " ").strip()
    if not cleaned_text:
        print("Warning: Text is empty after cleaning, cannot get embedding.")
        return None

    attempt = 0
    while attempt < max_retries:
        # Only announce and wait for a retry when another attempt will
        # actually follow; sleeping after the final failure just adds latency.
        will_retry = attempt + 1 < max_retries
        try:
            response = openai_client.embeddings.create(input=[cleaned_text], model=model)
            return response.data[0].embedding
        except openai.RateLimitError:
            if will_retry:
                wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
                print(f"Rate limit embedding. Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print("Rate limit embedding. No retries left.")
        except openai.APIConnectionError:
            if will_retry:
                print("Connection error embedding. Retrying...")
                time.sleep(2)
            else:
                print("Connection error embedding. No retries left.")
        except Exception as e:
            # Any other error is retried immediately, without a delay.
            print(f"Error generating embedding (Attempt {attempt + 1}/{max_retries}): {type(e).__name__}")
        attempt += 1

    print(f"Failed embedding after {max_retries} attempts.")
    return None
# --- Context formatting for OpenAI prompts (replaces the removed format_context_for_anthropic) ---
def format_context_for_openai(documents: List[Dict]) -> str:
    """Render *documents* as a numbered source list for an OpenAI prompt.

    Each dict contributes one entry whose body is the cleaned value of its
    ``'hebrew_text'`` key. The entry header shows the item's 1-based position
    in *documents*, its ``'original_id'`` (or ``unknown_<position>`` when the
    key is absent) and, when non-empty, its ``'source_name'``. Non-dict items
    are skipped with a printed warning; dicts whose cleaned text is empty are
    skipped silently. Positions are not renumbered after a skip.

    Returns a fixed placeholder sentence when *documents* is empty or when
    nothing could be formatted.
    """
    if not documents:
        return "No source texts provided."

    entries = []
    for position, item in enumerate(documents, start=1):
        if not isinstance(item, dict):
            print(f"Warning: Skipping non-dict item in documents list: {item}")
            continue

        body = clean_source_text(item.get('hebrew_text', ''))
        if not body:
            continue

        identifier = item.get('original_id', f'unknown_{position}')
        origin = item.get('source_name', '')
        # The source name is optional and only shown when present.
        origin_part = f", SourceName: {origin}" if origin else ""
        # Each entry ends with a '---' separator line.
        entries.append(f"Source {position} (ID: {identifier}{origin_part}):\n{body}\n---")

    if entries:
        return "\n".join(entries)
    return "No valid source texts could be formatted."