Spaces:

ABE101
/

april28ragdivreyyoel

Sleeping

App Files Files Community

april28ragdivreyyoel / utils.py

ABE101

Upload 5 files

ae4184d verified 2 months ago

raw

history blame contribute delete

3.59 kB

	# utils.py (Updated for OpenAI context formatting)
	import re
	import os
	import time
	import traceback
	import openai
	from typing import Optional, List, Dict

	# The rest of this module reads its settings from `config` (OPENAI_API_KEY,
	# EMBEDDING_MODEL), so stop at import time when the module cannot be imported.
	try:
	    import config
	except ImportError:
	    # Print a readable message, then exit instead of surfacing the ImportError.
	    print("Error: config.py not found. Cannot proceed.")
	    raise SystemExit("config.py not found")

	# Shared OpenAI client used by get_embedding; it stays None when no API key is configured or initialization fails.
	openai_client = None
	if not config.OPENAI_API_KEY:
	    # Without a key the client is left as None; get_embedding checks for that.
	    print("Utils: Warning - OPENAI_API_KEY not found. Embeddings will fail.")
	else:
	    try:
	        openai_client = openai.OpenAI(api_key=config.OPENAI_API_KEY)
	        print("Utils: OpenAI client initialized for embeddings.")
	    except Exception as init_error:
	        # Best-effort start-up: report the problem and keep the module importable.
	        print(f"Utils: Error initializing OpenAI client for embeddings: {init_error}")

	def clean_source_text(text: Optional[str]) -> str:
	    """Return *text* stripped of junk characters and with whitespace normalized.

	    NUL (``\\x00``) and Unicode replacement (``\\ufffd``) characters are removed,
	    every run of whitespace is collapsed to a single space, and leading and
	    trailing whitespace is trimmed. ``None`` and empty input yield ``""``.
	    """
	    if not text:
	        return ""

	    scrubbed = text
	    for unwanted in ('\x00', '\ufffd'):
	        scrubbed = scrubbed.replace(unwanted, '')

	    collapsed = re.sub(r'\s+', ' ', scrubbed)
	    return collapsed.strip()

	def get_embedding(text: str, model: str = config.EMBEDDING_MODEL, max_retries: int = 3) -> Optional[List[float]]:
	    """Request an embedding for *text* through the module-level OpenAI client.

	    Newlines in *text* are replaced with spaces and surrounding whitespace is
	    stripped before the request is sent.

	    Args:
	        text: Text to embed; must be a non-empty string.
	        model: Embedding model name; defaults to ``config.EMBEDDING_MODEL``.
	        max_retries: Maximum number of API calls made before giving up.

	    Returns:
	        The ``embedding`` of the first item in the response, or ``None`` when
	        the client is not initialized, the input is empty or not a string, or
	        every attempt failed.
	    """
	    if not openai_client:
	        print("Error: OpenAI client not initialized (utils.py). Cannot get embedding.")
	        return None
	    if not text or not isinstance(text, str):
	        print("Error: Invalid input text for embedding.")
	        return None

	    cleaned_text = text.replace("\n", " ").strip()
	    if not cleaned_text:
	        print("Warning: Text is empty after cleaning, cannot get embedding.")
	        return None

	    for attempt in range(max_retries):
	        # Only wait (and only announce a retry) when another attempt will follow.
	        retries_left = attempt + 1 < max_retries
	        try:
	            response = openai_client.embeddings.create(input=[cleaned_text], model=model)
	            return response.data[0].embedding
	        except openai.RateLimitError:
	            if retries_left:
	                # Exponential backoff: 1s, 2s, 4s, ...
	                wait_time = 2 ** attempt
	                print(f"Rate limit embedding. Retrying in {wait_time}s...")
	                time.sleep(wait_time)
	            else:
	                print("Rate limit embedding. No retries left.")
	        except openai.APIConnectionError:
	            if retries_left:
	                print("Connection error embedding. Retrying...")
	                time.sleep(2)
	            else:
	                print("Connection error embedding. No retries left.")
	        except Exception as e:
	            # Deliberately broad: any other failure is reported by type name
	            # only and the next attempt, if any, runs immediately.
	            print(f"Error generating embedding (Attempt {attempt + 1}/{max_retries}): {type(e).__name__}")

	    print(f"Failed embedding after {max_retries} attempts.")
	    return None

	# --- REMOVED format_context_for_anthropic ---

	# --- NEW Function to format context for OpenAI ---
	def format_context_for_openai(documents: List[Dict]) -> str:
	    """Render source documents as a numbered text block for the OpenAI prompt.

	    Each dict contributes one entry of the form
	    ``Source N (ID: <original_id>[, SourceName: <source_name>]):`` followed by
	    its cleaned ``hebrew_text`` and a ``---`` separator line. ``N`` is the
	    item's 1-based position in *documents*, so skipped items leave gaps in the
	    numbering. Non-dict items are reported and skipped; dicts whose text is
	    empty after cleaning are skipped silently.

	    Returns:
	        The entries joined by newlines, or a fixed placeholder sentence when
	        *documents* is empty or nothing could be formatted.
	    """
	    if not documents:
	        return "No source texts provided."

	    language_key = 'hebrew_text'
	    id_key = 'original_id'
	    source_key = 'source_name'  # optional field

	    entries = []
	    for position, item in enumerate(documents, start=1):
	        if not isinstance(item, dict):
	            print(f"Warning: Skipping non-dict item in documents list: {item}")
	            continue

	        body = clean_source_text(item.get(language_key, ''))
	        if not body:
	            continue

	        identifier = item.get(id_key, f'unknown_{position}')
	        label = item.get(source_key, '')

	        # Assemble "Source N (ID: ...[, SourceName: ...])".
	        header_parts = [f"Source {position} (ID: {identifier}"]
	        if label:
	            header_parts.append(f", SourceName: {label}")
	        header_parts.append(")")
	        header = "".join(header_parts)

	        entries.append(f"{header}:\n{body}\n---")

	    if entries:
	        return "\n".join(entries)
	    return "No valid source texts could be formatted."