Spaces:

lsempe77
/

fcas

Sleeping

App Files Files Community

fcas / data_handler.py

lsempe

Clean repo, remove binary history

9c062cd 6 months ago

raw

history blame contribute delete

2.33 kB

	import pandas as pd
	import re
	from synthesis_qa_backend import ResearchSynthesizer
	from config import API_KEY, INDEX_PATH, METADATA_PATH, SPECIFIC_COUNTRIES

	class DataHandler:
	    """Load the research backend and the document metadata table.

	    On construction the handler tries to build a ``ResearchSynthesizer`` and
	    to read the CSV at ``METADATA_PATH``. Loading is best-effort: if any step
	    raises, the error is printed and every attribute is put back into its
	    empty state instead of propagating the exception.

	    Attributes:
	        synthesizer: The ``ResearchSynthesizer`` instance, or ``None`` when
	            loading failed.
	        docs_df: Metadata rows de-duplicated on ``record_id``; an empty
	            ``DataFrame`` when loading failed.
	        countries_list: Sorted unique country names taken from the
	            ``study_countries`` column and restricted to
	            ``SPECIFIC_COUNTRIES``.
	        sectors_list: Sorted unique non-null ``world_bank_sector`` values.
	    """

	    # Stringified cell values (lower-cased) that are treated as "no country".
	    _EMPTY_MARKERS = ('nan', 'none', '')

	    def __init__(self):
	        self._reset()
	        self.load_data()

	    def _reset(self):
	        """Put every attribute back into its empty, "nothing loaded" state."""
	        self.synthesizer = None
	        self.docs_df = pd.DataFrame()
	        self.countries_list = []
	        self.sectors_list = []

	    def load_data(self):
	        """Initialize the research system and load data.

	        Never raises: any ``Exception`` is printed and the handler is reset,
	        so callers must check ``synthesizer is None`` / ``docs_df.empty`` to
	        detect a failed load.
	        """
	        try:
	            self.synthesizer = ResearchSynthesizer(INDEX_PATH, METADATA_PATH, API_KEY)
	            metadata_df = pd.read_csv(METADATA_PATH)
	            self.docs_df = metadata_df.drop_duplicates(subset=['record_id'])
	            print(f"✅ Loaded {len(self.docs_df)} unique documents")

	            # Get unique values for dropdowns
	            self.countries_list, self.sectors_list = self._get_unique_values()

	        except Exception as e:
	            # Deliberately broad: this is the load boundary and the handler
	            # degrades to an empty state rather than crashing its caller.
	            print(f"❌ Error loading system: {e}")
	            # Reset *all* state, including the dropdown lists, so a failed
	            # (re)load never leaves stale lists next to an empty docs_df.
	            self._reset()

	    def _get_unique_values(self):
	        """Get unique values for dropdowns.

	        Returns:
	            A ``(countries, sectors)`` tuple of sorted lists. Both are empty
	            when ``docs_df`` is empty; each is empty when its source column
	            (``study_countries`` / ``world_bank_sector``) is missing.
	        """
	        if self.docs_df.empty:
	            return [], []

	        countries = set()
	        if 'study_countries' in self.docs_df.columns:
	            for raw_value in self.docs_df['study_countries'].dropna():
	                text = str(raw_value)
	                if text.lower() in self._EMPTY_MARKERS:
	                    continue
	                # A cell may hold several countries separated by ';' or ','.
	                names = (part.strip() for part in text.replace(';', ',').split(','))
	                countries.update(
	                    name for name in names
	                    if len(name) > 1 and name in SPECIFIC_COUNTRIES
	                )

	        sectors = []
	        if 'world_bank_sector' in self.docs_df.columns:
	            sectors = sorted(self.docs_df['world_bank_sector'].dropna().unique().tolist())

	        return sorted(countries), sectors

	    def get_data(self):
	        """Return all data objects as a dict keyed by attribute name.

	        The values are the handler's own objects, not copies.
	        """
	        return {
	            'synthesizer': self.synthesizer,
	            'docs_df': self.docs_df,
	            'countries_list': self.countries_list,
	            'sectors_list': self.sectors_list,
	        }