|
|
|
|
|
import gradio as gr |
|
import pandas as pd |
|
from typing import List, Dict, Any |
|
import asyncio |
|
from urllib.parse import urlparse |
|
|
|
|
|
from ankigen_core.utils import ( |
|
get_logger, |
|
ResponseCache, |
|
fetch_webpage_text, |
|
strip_html_tags, |
|
) |
|
from ankigen_core.llm_interface import OpenAIClientManager, structured_output_completion |
|
from ankigen_core.models import ( |
|
Card, |
|
CardFront, |
|
CardBack, |
|
) |
|
|
|
|
|
from ankigen_core.agents.integration import AgentOrchestrator |
|
from agents import set_tracing_disabled |
|
|
|
# Module-level logger shared by every helper in this module.
logger = get_logger()

# Disable OpenAI agents SDK tracing so no telemetry/trace uploads occur.
set_tracing_disabled(True)

# Hard-coded True: the agent imports above raise at module load time if the
# agent system is unavailable, so reaching this line implies it is importable.
# NOTE(review): because this is always True, the legacy (non-agent) path in
# orchestrate_card_generation below is currently unreachable — confirm intent.
AGENTS_AVAILABLE = True
logger.info("Agent system loaded successfully")
|
|
|
|
|
# Models offered in the UI model dropdown. "value" is the OpenAI model id sent
# to the API; "label"/"description" are display-only strings.
AVAILABLE_MODELS = [
    {
        "value": "gpt-4.1",
        "label": "GPT-4.1 (Best Quality)",
        "description": "Highest quality, large context window",
    },
    {
        "value": "gpt-4.1-nano",
        "label": "GPT-4.1 Nano (Ultra Fast)",
        "description": "Ultra-fast and cost-effective",
    },
]

# Generation modes selectable in the UI. "value" must match the
# generation_mode strings branched on in orchestrate_card_generation
# ("subject", "path", "text", "web").
GENERATION_MODES = [
    {
        "value": "subject",
        "label": "Single Subject",
        "description": "Generate cards for a specific topic",
    },
    {
        "value": "path",
        "label": "Learning Path",
        "description": "Break down a job description or learning goal into subjects",
    },
    {
        "value": "text",
        "label": "From Text",
        "description": "Generate cards from provided text",
    },
    {
        "value": "web",
        "label": "From Web",
        "description": "Generate cards from a web page URL",
    },
]
|
|
|
|
|
|
|
|
|
async def generate_cards_batch(
    openai_client,
    cache: ResponseCache,
    model: str,
    topic: str,
    num_cards: int,
    system_prompt: str,
    generate_cloze: bool = False,
    batch_size: int = 3,
):
    """Generate a batch of cards for a topic, potentially including cloze deletions.

    Args:
        openai_client: Initialized OpenAI client forwarded to
            structured_output_completion.
        cache: Response cache consulted/updated by the completion helper.
        model: Model name to request.
        topic: Topic the cards should cover (interpolated into the prompt).
        num_cards: Number of cards requested in the prompt.
        system_prompt: System prompt for the completion call.
        generate_cloze: When True, the prompt also asks for Anki cloze cards.
        batch_size: NOTE(review) — unused inside this function; confirm
            whether callers expect it to chunk requests.

    Returns:
        list[Card]: Cards parsed from the LLM JSON response. Entries missing
        front/back/question/answer/explanation/example are skipped.

    Raises:
        ValueError: If the response is empty or lacks a "cards" key.
        Exception: Any error from the completion call is logged and re-raised.
    """

    # Extra prompt text switching the LLM into mixed basic/cloze output.
    # NOTE(review): duplicates get_cloze_instruction() below — consider
    # consolidating so the two prompt texts cannot drift apart.
    cloze_instruction = ""
    if generate_cloze:
        cloze_instruction = """
    Where appropriate, generate Cloze deletion cards.
    - For Cloze cards, set "card_type" to "cloze".
    - Format the question field using Anki's cloze syntax (e.g., "The capital of France is {{c1::Paris}}.").
    - The "answer" field should contain the full, non-cloze text or specific context for the cloze.
    - For standard question/answer cards, set "card_type" to "basic".
    """

    # f-string: literal braces in the JSON example are escaped as {{ }}, so the
    # quadruple braces render as a literal {{c1::...}} cloze example.
    cards_prompt = f"""
    Generate {num_cards} flashcards for the topic: {topic}
    {cloze_instruction}
    Return your response as a JSON object with the following structure:
    {{
        "cards": [
            {{
                "card_type": "basic or cloze",
                "front": {{
                    "question": "question text (potentially with {{{{c1::cloze syntax}}}})"
                }},
                "back": {{
                    "answer": "concise answer or full text for cloze",
                    "explanation": "detailed explanation",
                    "example": "practical example"
                }},
                "metadata": {{
                    "prerequisites": ["list", "of", "prerequisites"],
                    "learning_outcomes": ["list", "of", "outcomes"],
                    "misconceptions": ["list", "of", "misconceptions"],
                    "difficulty": "beginner/intermediate/advanced"
                }}
            }}
            // ... more cards
        ]
    }}
    """

    try:
        logger.info(
            f"Generating card batch for {topic}, Cloze enabled: {generate_cloze}"
        )

        # One JSON-mode completion for the whole batch; caching is handled
        # inside the helper.
        response = await structured_output_completion(
            openai_client=openai_client,
            model=model,
            response_format={"type": "json_object"},
            system_prompt=system_prompt,
            user_prompt=cards_prompt,
            cache=cache,
        )

        if not response or "cards" not in response:
            logger.error("Invalid cards response format")
            raise ValueError("Failed to generate cards. Please try again.")

        # Validate each raw card dict before constructing model objects;
        # malformed entries are skipped rather than failing the batch.
        cards_list = []
        for card_data in response["cards"]:
            if "front" not in card_data or "back" not in card_data:
                logger.warning(
                    f"Skipping card due to missing front/back data: {card_data}"
                )
                continue
            if "question" not in card_data["front"]:
                logger.warning(f"Skipping card due to missing question: {card_data}")
                continue
            if (
                "answer" not in card_data["back"]
                or "explanation" not in card_data["back"]
                or "example" not in card_data["back"]
            ):
                logger.warning(
                    f"Skipping card due to missing answer/explanation/example: {card_data}"
                )
                continue

            # strip_html_tags keeps downstream DataFrame/export text HTML-free.
            card = Card(
                card_type=card_data.get("card_type", "basic"),
                front=CardFront(
                    question=strip_html_tags(card_data["front"].get("question", ""))
                ),
                back=CardBack(
                    answer=strip_html_tags(card_data["back"].get("answer", "")),
                    explanation=strip_html_tags(
                        card_data["back"].get("explanation", "")
                    ),
                    example=strip_html_tags(card_data["back"].get("example", "")),
                ),
                metadata=card_data.get("metadata", {}),
            )
            cards_list.append(card)

        return cards_list

    except Exception as e:
        logger.error(
            f"Failed to generate cards batch for {topic}: {str(e)}", exc_info=True
        )
        raise
|
|
|
|
|
async def judge_card(
    openai_client,
    cache: ResponseCache,
    model: str,
    card: Card,
) -> bool:
    """Use an LLM to validate a single card.

    Fails open: any error from the completion call, or a non-dict response,
    counts as a pass so the judge never silently drops cards on failure.
    """
    judge_system_prompt = (
        "You review flashcards and decide if the question is clear and useful. "
        'Respond with a JSON object like {"is_valid": true}.'
    )
    judge_user_prompt = f"Question: {card.front.question}\nAnswer: {card.back.answer}"
    try:
        verdict = await structured_output_completion(
            openai_client=openai_client,
            model=model,
            response_format={"type": "json_object"},
            system_prompt=judge_system_prompt,
            user_prompt=judge_user_prompt,
            cache=cache,
        )
        if isinstance(verdict, dict):
            return bool(verdict.get("is_valid", True))
    except Exception as e:
        logger.warning(f"LLM judge failed for card '{card.front.question}': {e}")
    # Non-dict response or judge failure: accept the card.
    return True
|
|
|
|
|
async def judge_cards(
    openai_client,
    cache: ResponseCache,
    model: str,
    cards: List[Card],
) -> List[Card]:
    """Filter cards using the LLM judge, keeping only the approved ones."""
    approved: List[Card] = []
    for candidate in cards:
        is_ok = await judge_card(openai_client, cache, model, candidate)
        if not is_ok:
            logger.info(f"Card rejected by judge: {candidate.front.question}")
            continue
        approved.append(candidate)
    return approved
|
|
|
|
|
async def orchestrate_card_generation(
    client_manager: OpenAIClientManager,
    cache: ResponseCache,
    api_key_input: str,
    subject: str,
    generation_mode: str,
    source_text: str,
    url_input: str,
    model_name: str,
    topic_number: int,
    cards_per_topic: int,
    preference_prompt: str,
    generate_cloze: bool,
    use_llm_judge: bool = False,
):
    """Orchestrates the card generation process based on UI inputs.

    Dispatches on ``generation_mode`` ("subject", "path", "text"; "web" falls
    through to the unknown-mode error) and returns values consumed by the
    Gradio UI: a DataFrame of cards, an HTML summary string, and (on most
    paths) a third element.

    NOTE(review): the return arity is inconsistent across paths — the agent
    path returns (df, msg, token_html), most error paths return
    (df, msg, gr.update(...)), the client-init errors return (df, msg, 0),
    and the legacy success path returns only (df, msg). Confirm what the
    Gradio outputs expect.

    NOTE(review): ``url_input`` is never read in this function.
    """

    logger.info(f"Starting card generation orchestration in {generation_mode} mode")
    logger.debug(
        f"Parameters: mode={generation_mode}, topics={topic_number}, cards_per_topic={cards_per_topic}, cloze={generate_cloze}"
    )

    # --- Agent path ---------------------------------------------------------
    # AGENTS_AVAILABLE is hard-coded True at module level, and every branch of
    # this block returns, so the legacy path below is currently dead code.
    if AGENTS_AVAILABLE:
        logger.info("🤖 Using agent system for card generation")
        try:
            # Imported lazily so module import does not require the tracker.
            from ankigen_core.agents.token_tracker import get_token_tracker

            token_tracker = get_token_tracker()

            orchestrator = AgentOrchestrator(client_manager)

            # Force every agent role to the UI-selected model.
            logger.info(f"Overriding all agent models to use: {model_name}")
            model_overrides = {
                "generation_coordinator": model_name,
                "subject_expert": model_name,
                "pedagogical_agent": model_name,
                "content_structuring": model_name,
                "enhancement_agent": model_name,
                "revision_agent": model_name,
                "content_accuracy_judge": model_name,
                "pedagogical_judge": model_name,
                "clarity_judge": model_name,
                "technical_judge": model_name,
                "completeness_judge": model_name,
                "judge_coordinator": model_name,
            }

            await orchestrator.initialize(api_key_input, model_overrides)

            # Map the UI mode onto the agent-system subject hint.
            agent_subject = "general"
            if generation_mode == "subject":
                agent_subject = subject if subject else "general"
            elif generation_mode == "path":
                agent_subject = "curriculum_design"
            elif generation_mode == "text":
                agent_subject = "content_analysis"

            # Agents receive one flat card budget, not per-topic batches.
            total_cards_needed = topic_number * cards_per_topic

            context = {}
            if generation_mode == "text" and source_text:
                context["source_text"] = source_text

            agent_cards, agent_metadata = await orchestrator.generate_cards_with_agents(
                topic=subject if subject else "Mixed Topics",
                subject=agent_subject,
                num_cards=total_cards_needed,
                difficulty="intermediate",
                enable_quality_pipeline=True,
                context=context,
            )

            # Token-usage reporting is best-effort: tracker API has varied
            # across versions, so probe for either accessor and fall back to
            # a "no data" placeholder on any failure.
            try:
                if hasattr(token_tracker, "get_session_summary"):
                    token_usage = token_tracker.get_session_summary()
                elif hasattr(token_tracker, "get_session_usage"):
                    token_usage = token_tracker.get_session_usage()
                else:
                    raise AttributeError("TokenTracker has no session summary method")

                token_usage_html = f"<div style='margin-top: 8px;'><b>Token Usage:</b> {token_usage['total_tokens']} tokens</div>"
            except Exception as e:
                logger.error(f"Token usage collection failed: {e}")
                token_usage_html = "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"

            if agent_cards:
                formatted_cards = format_cards_for_dataframe(
                    agent_cards,
                    topic_name=f"Agent Generated - {subject}"
                    if subject
                    else "Agent Generated",
                    start_index=1,
                )

                output_df = pd.DataFrame(
                    formatted_cards, columns=get_dataframe_columns()
                )
                total_cards_message = f"<div><b>🤖 Agent Generated Cards:</b> <span id='total-cards-count'>{len(output_df)}</span></div>"

                logger.info(
                    f"Agent system generated {len(output_df)} cards successfully"
                )
                return output_df, total_cards_message, token_usage_html
            else:
                logger.error("Agent system returned no cards")
                gr.Error("🤖 Agent system returned no cards")
                return (
                    pd.DataFrame(columns=get_dataframe_columns()),
                    "Agent system returned no cards.",
                    "",
                )

        except Exception as e:
            logger.error(f"Agent system failed: {e}")
            gr.Error(f"🤖 Agent system error: {str(e)}")
            return (
                pd.DataFrame(columns=get_dataframe_columns()),
                f"Agent system error: {str(e)}",
                "",
            )

    # --- Legacy (non-agent) path -------------------------------------------
    # Unreachable while AGENTS_AVAILABLE is True (see NOTE above).
    logger.error("Agent system not available but required")
    if not api_key_input:
        logger.warning("No API key provided to orchestrator")
        gr.Error("OpenAI API key is required")
        return pd.DataFrame(columns=get_dataframe_columns()), "API key is required.", 0

    try:
        await client_manager.initialize_client(api_key_input)
        openai_client = client_manager.get_client()
    # NOTE(review): listing Exception alongside its subclasses is redundant —
    # this catches everything.
    except (ValueError, RuntimeError, Exception) as e:
        logger.error(f"Client initialization failed in orchestrator: {e}")
        gr.Error(f"OpenAI Client Error: {e}")
        return (
            pd.DataFrame(columns=get_dataframe_columns()),
            f"OpenAI Client Error: {e}",
            0,
        )

    model = model_name
    flattened_data = []  # Accumulated row dicts across all topics.
    total_cards_generated = 0

    try:
        if generation_mode == "subject":
            logger.info("Orchestrator: Subject Mode")
            if not subject or not subject.strip():
                gr.Error("Subject is required for 'Single Subject' mode.")
                return (
                    pd.DataFrame(columns=get_dataframe_columns()),
                    "Subject is required.",
                    gr.update(
                        value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                        visible=False,
                    ),
                )
            # NOTE(review): when several comma-separated subjects are given,
            # the system prompt still interpolates the raw combined string.
            system_prompt = f"""You are an expert in {subject} and an experienced educator. {preference_prompt}"""

            # The subject field may hold a comma-separated list of subjects.
            individual_subjects = [s.strip() for s in subject.split(",") if s.strip()]
            if (
                not individual_subjects
            ):
                gr.Error("Valid subject(s) required.")
                return (
                    pd.DataFrame(columns=get_dataframe_columns()),
                    "Valid subject(s) required.",
                    gr.update(
                        value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                        visible=False,
                    ),
                )

            topics_for_generation = []

            for ind_subject in individual_subjects:
                # Single subject: honor topic_number by splitting it into
                # "Aspect" sub-topics. Multiple subjects: one topic each.
                if len(individual_subjects) == 1:
                    if topic_number == 1:
                        topics_for_generation.append(
                            {"name": ind_subject, "num_cards": cards_per_topic}
                        )
                    else:
                        for i in range(topic_number):
                            topics_for_generation.append(
                                {
                                    "name": f"{ind_subject} - Aspect {i + 1}",
                                    "num_cards": cards_per_topic,
                                }
                            )
                else:
                    topics_for_generation.append(
                        {"name": ind_subject, "num_cards": cards_per_topic}
                    )

        elif generation_mode == "path":
            logger.info("Orchestrator: Learning Path Mode")

            # In path mode, the subject field carries the comma-separated
            # subjects produced by the upstream learning-path analysis.
            if (
                not subject or not subject.strip()
            ):
                gr.Error("No subjects provided from learning path analysis.")
                return (
                    pd.DataFrame(columns=get_dataframe_columns()),
                    "No subjects from path analysis.",
                    gr.update(
                        value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                        visible=False,
                    ),
                )

            system_prompt = f"""You are an expert in curriculum design and an experienced educator. {preference_prompt}"""
            analyzed_subjects = [s.strip() for s in subject.split(",") if s.strip()]
            if not analyzed_subjects:
                gr.Error("No valid subjects parsed from learning path.")
                return (
                    pd.DataFrame(columns=get_dataframe_columns()),
                    "No valid subjects from path.",
                    gr.update(
                        value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                        visible=False,
                    ),
                )

            topics_for_generation = [
                {"name": subj, "num_cards": cards_per_topic}
                for subj in analyzed_subjects
            ]

        elif generation_mode == "text":
            logger.info("Orchestrator: Text Mode")
            actual_text_to_process = source_text

            if (
                not actual_text_to_process or not actual_text_to_process.strip()
            ):
                gr.Error("Text input is empty.")
                return (
                    pd.DataFrame(columns=get_dataframe_columns()),
                    "Text input is empty.",
                    gr.update(
                        value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                        visible=False,
                    ),
                )

            # If the pasted text is actually a URL, fetch its page content.
            is_url = False
            if isinstance(source_text, str) and source_text.strip().lower().startswith(
                ("http://", "https://")
            ):
                try:
                    result = urlparse(source_text.strip())
                    if all([result.scheme, result.netloc]):
                        is_url = True
                # NOTE(review): urlparse is imported at module top, so
                # ImportError cannot be raised here — this handler is inert.
                except ImportError:
                    pass

            if is_url:
                url_to_fetch = source_text.strip()
                logger.info(f"Text mode identified URL: {url_to_fetch}")
                gr.Info(f"🕸️ Fetching content from URL in text field: {url_to_fetch}...")
                try:
                    # fetch_webpage_text is blocking; run it off the event loop.
                    page_content = await asyncio.to_thread(
                        fetch_webpage_text, url_to_fetch
                    )
                    if not page_content or not page_content.strip():
                        gr.Warning(
                            f"Could not extract meaningful text from URL: {url_to_fetch}. Please check the URL or page content."
                        )
                        return (
                            pd.DataFrame(columns=get_dataframe_columns()),
                            "No meaningful text extracted from URL.",
                            gr.update(
                                value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                                visible=False,
                            ),
                        )
                    actual_text_to_process = page_content
                    source_text_display_name = f"Content from {url_to_fetch}"
                    gr.Info(
                        f"✅ Successfully fetched text from URL (approx. {len(actual_text_to_process)} chars)."
                    )
                except Exception as e:
                    logger.error(
                        f"Failed to fetch or process URL {url_to_fetch} in text mode: {e}",
                        exc_info=True,
                    )
                    gr.Error(f"Failed to fetch content from URL: {str(e)}")
                    return (
                        pd.DataFrame(columns=get_dataframe_columns()),
                        f"URL fetch error: {str(e)}",
                        gr.update(
                            value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                            visible=False,
                        ),
                    )
            else:
                # NOTE(review): redundant — emptiness was already checked above.
                if (
                    not source_text or not source_text.strip()
                ):
                    gr.Error("Text input is empty.")
                    return (
                        pd.DataFrame(columns=get_dataframe_columns()),
                        "Text input is empty.",
                        gr.update(
                            value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                            visible=False,
                        ),
                    )
                actual_text_to_process = source_text
                source_text_display_name = "Content from Provided Text"
                logger.info("Text mode: Processing provided text directly.")

            system_prompt = f"""You are an expert in distilling information and creating flashcards from text. {preference_prompt}
            Base your flashcards STRICTLY on the following text content provided by the user in their next message.
            Do not use external knowledge unless explicitly asked to clarify something from the text.
            The user will provide the text content that needs to be turned into flashcards."""

            # Text is truncated to 15k chars to bound prompt size.
            text_mode_user_prompt = f"""
            Please generate {cards_per_topic * topic_number} flashcards based on the following text content.
            I have already provided the text content in the system prompt (or it is implicitly part of this context).
            Ensure the flashcards cover diverse aspects of the text.
            {get_cloze_instruction(generate_cloze)}
            Return your response as a JSON object with the following structure:
            {get_card_json_structure_prompt()}

            Text Content to process:
            ---
            {actual_text_to_process[:15000]}
            ---
            """

            gr.Info(f"Generating cards from: {source_text_display_name}...")
            try:
                response = await structured_output_completion(
                    openai_client=openai_client,
                    model=model,
                    response_format={"type": "json_object"},
                    system_prompt=system_prompt,
                    user_prompt=text_mode_user_prompt,
                    cache=cache,
                )
                raw_cards = []
                if response:
                    raw_cards = response.get("cards", [])
                else:
                    logger.warning(
                        "structured_output_completion returned None, defaulting to empty card list for text mode."
                    )
                processed_cards = process_raw_cards_data(raw_cards)
                if use_llm_judge and processed_cards:
                    processed_cards = await judge_cards(
                        openai_client, cache, model, processed_cards
                    )
                formatted_cards = format_cards_for_dataframe(
                    processed_cards, topic_name=source_text_display_name, start_index=1
                )
                flattened_data.extend(formatted_cards)
                total_cards_generated += len(formatted_cards)

                # Text mode produced its cards already; skip the topic loop.
                topics_for_generation = []

            except Exception as e:
                logger.error(
                    f"Error during 'From Text' card generation: {e}", exc_info=True
                )
                gr.Error(f"Error generating cards from text: {str(e)}")
                return (
                    pd.DataFrame(columns=get_dataframe_columns()),
                    f"Text Gen Error: {str(e)}",
                    gr.update(
                        value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                        visible=False,
                    ),
                )

        else:
            # Covers "web" too: it is listed in GENERATION_MODES but has no
            # branch here, so it lands in this error path.
            logger.error(f"Unknown generation mode: {generation_mode}")
            gr.Error(f"Unknown generation mode: {generation_mode}")
            return (
                pd.DataFrame(columns=get_dataframe_columns()),
                "Unknown mode.",
                gr.update(
                    value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                    visible=False,
                ),
            )

        # Per-topic generation loop (subject/path modes; empty for text mode).
        # A failure on one topic is reported as a warning and skipped so the
        # remaining topics still get generated.
        for topic_info in (
            topics_for_generation
        ):
            gr.Info(
                f"Generating cards for topic: {topic_info['name']}..."
            )

            try:
                batch_cards = await generate_cards_batch(
                    openai_client,
                    cache,
                    model,
                    topic_info["name"],
                    topic_info["num_cards"],
                    system_prompt,
                    generate_cloze,
                )
                if use_llm_judge and batch_cards:
                    batch_cards = await judge_cards(
                        openai_client, cache, model, batch_cards
                    )

                formatted_batch = format_cards_for_dataframe(
                    batch_cards,
                    topic_name=topic_info["name"],
                    start_index=total_cards_generated + 1,
                )
                flattened_data.extend(formatted_batch)
                total_cards_generated += len(formatted_batch)
                logger.info(
                    f"Generated {len(formatted_batch)} cards for topic {topic_info['name']}"
                )

            except Exception as e:
                logger.error(
                    f"Error generating cards for topic {topic_info['name']}: {e}",
                    exc_info=True,
                )

                gr.Warning(
                    f"Could not generate cards for topic '{topic_info['name']}': {str(e)}. Skipping."
                )
                continue

        if not flattened_data:
            gr.Info(
                "No cards were generated."
            )

            return (
                pd.DataFrame(columns=get_dataframe_columns()),
                "No cards generated.",
                gr.update(
                    value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                    visible=False,
                ),
            )

        final_cards_data = (
            flattened_data
        )

        output_df = pd.DataFrame(final_cards_data, columns=get_dataframe_columns())

        total_cards_message = f"<div><b>💡 Legacy Generated Cards:</b> <span id='total-cards-count'>{len(output_df)}</span></div>"

        logger.info(f"Legacy orchestration complete. Total cards: {len(output_df)}")
        # NOTE(review): 2-tuple here vs 3-tuples on every other path.
        return output_df, total_cards_message

    except Exception as e:
        logger.error(
            f"Critical error in orchestrate_card_generation: {e}", exc_info=True
        )
        gr.Error(f"An unexpected error occurred: {str(e)}")
        return (
            pd.DataFrame(columns=get_dataframe_columns()),
            f"Unexpected error: {str(e)}",
            gr.update(
                value="<div><b>Total Cards Generated:</b> <span id='total-cards-count'>0</span></div>",
                visible=False,
            ),
        )
    finally:
        # No cleanup required; kept for structural symmetry.
        pass
|
|
|
|
|
|
|
def get_cloze_instruction(generate_cloze: bool) -> str:
    """Return the extra prompt text enabling cloze cards, or "" when disabled."""
    if not generate_cloze:
        return ""
    return """
    Where appropriate, generate Cloze deletion cards.
    - For Cloze cards, set "card_type" to "cloze".
    - Format the question field using Anki's cloze syntax (e.g., "The capital of France is {{c1::Paris}}.").
    - The "answer" field should contain the full, non-cloze text or specific context for the cloze.
    - For standard question/answer cards, set "card_type" to "basic".
    """
|
|
|
|
|
|
|
def get_card_json_structure_prompt() -> str:
    """Return the JSON response-schema snippet embedded in generation prompts.

    This is a plain (non-f) string, so braces must NOT be doubled. The cloze
    example reads ``{{c1::cloze syntax}}`` exactly as Anki expects — the
    previous ``{{{{...}}}}`` was leftover f-string escaping copied from the
    inline prompt in generate_cards_batch (where ``{{{{`` renders as ``{{``),
    which made this helper emit four literal braces.
    """
    return """
    {
        "cards": [
            {
                "card_type": "basic or cloze",
                "front": {
                    "question": "question text (potentially with {{c1::cloze syntax}})"
                },
                "back": {
                    "answer": "concise answer or full text for cloze",
                    "explanation": "detailed explanation",
                    "example": "practical example"
                },
                "metadata": {
                    "prerequisites": ["list", "of", "prerequisites"],
                    "learning_outcomes": ["list", "of", "outcomes"],
                    "misconceptions": ["list", "of", "misconceptions"],
                    "difficulty": "beginner/intermediate/advanced"
                }
            }
            // ... more cards
        ]
    }
    """
|
|
|
|
|
|
|
def process_raw_cards_data(cards_data: list) -> list[Card]:
    """Convert raw LLM card dicts into validated Card objects.

    Non-dict items and items missing the essential front/back fields are
    logged and skipped; a malformed item never aborts the whole batch.
    """
    parsed: list[Card] = []
    if not isinstance(cards_data, list):
        logger.warning(
            f"Expected a list of cards, got {type(cards_data)}. Raw data: {cards_data}"
        )
        return parsed

    for card_item in cards_data:
        if not isinstance(card_item, dict):
            logger.warning(
                f"Expected card item to be a dict, got {type(card_item)}. Item: {card_item}"
            )
            continue
        try:
            front = card_item.get("front")
            back = card_item.get("back")
            # Essential shape: dict front with "question", dict back with "answer".
            essentials_ok = (
                all(k in card_item for k in ["front", "back"])
                and isinstance(front, dict)
                and isinstance(back, dict)
                and "question" in front
                and "answer" in back
            )
            if not essentials_ok:
                logger.warning(
                    f"Skipping card due to missing essential fields: {card_item}"
                )
                continue

            # HTML is stripped so downstream display/export stays plain text.
            parsed.append(
                Card(
                    card_type=card_item.get("card_type", "basic"),
                    front=CardFront(
                        question=strip_html_tags(front.get("question", ""))
                    ),
                    back=CardBack(
                        answer=strip_html_tags(back.get("answer", "")),
                        explanation=strip_html_tags(back.get("explanation", "")),
                        example=strip_html_tags(back.get("example", "")),
                    ),
                    metadata=card_item.get("metadata", {}),
                )
            )
        except Exception as e:
            logger.error(
                f"Error processing card data item: {card_item}. Error: {e}",
                exc_info=True,
            )
    return parsed
|
|
|
|
|
|
|
def format_cards_for_dataframe(
    cards: list[Card], topic_name: str, topic_index: int = 0, start_index: int = 1
) -> list:
    """Formats a list of Card objects into a list of dictionaries for DataFrame display.
    Ensures all data is plain text.
    """
    rows = []
    topic_text = strip_html_tags(topic_name)
    for offset, card in enumerate(cards):
        row_number = start_index + offset
        meta = card.metadata or {}

        def _as_plain_list_text(value):
            # Lists become comma-joined text; anything else is stringified.
            return strip_html_tags(
                ", ".join(value) if isinstance(value, list) else str(value)
            )

        # "topic_index.N" numbering when a topic index is supplied, else "N".
        index_label = (
            f"{topic_index}.{row_number}" if topic_index > 0 else str(row_number)
        )

        rows.append(
            {
                "Index": index_label,
                "Topic": topic_text,
                "Card_Type": strip_html_tags(card.card_type or "basic"),
                "Question": card.front.question or "",
                "Answer": card.back.answer or "",
                "Explanation": card.back.explanation or "",
                "Example": card.back.example or "",
                "Prerequisites": _as_plain_list_text(meta.get("prerequisites", [])),
                "Learning_Outcomes": _as_plain_list_text(
                    meta.get("learning_outcomes", [])
                ),
                "Common_Misconceptions": _as_plain_list_text(
                    meta.get("misconceptions", [])
                ),
                "Difficulty": strip_html_tags(str(meta.get("difficulty", "N/A"))),
                "Source_URL": strip_html_tags(meta.get("source_url", "")),
            }
        )
    return rows
|
|
|
|
|
def get_dataframe_columns() -> list[str]:
    """Returns the standard list of columns for the Anki card DataFrame."""
    identity_columns = ["Index", "Topic", "Card_Type"]
    content_columns = ["Question", "Answer", "Explanation", "Example"]
    metadata_columns = [
        "Prerequisites",
        "Learning_Outcomes",
        "Common_Misconceptions",
        "Difficulty",
        "Source_URL",
    ]
    return identity_columns + content_columns + metadata_columns
|
|
|
|
|
|
|
|
|
|
|
def deduplicate_cards(cards: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Deduplicates card dicts by the 'Question' field.

    Questions are compared after lowercasing and collapsing whitespace, so
    cosmetic differences do not defeat deduplication. Rows with no 'Question'
    key are kept as-is (and logged).
    """
    seen: set = set()
    kept: List[Dict[str, Any]] = []
    for entry in cards:
        raw_question = entry.get("Question")
        if raw_question is None:
            # Malformed row: keep it but flag the missing key.
            logger.warning(f"Card dictionary missing 'Question' key: {entry}")
            kept.append(entry)
            continue

        # Normalize: lowercase, trim, collapse internal whitespace runs.
        dedup_key = " ".join(str(raw_question).strip().lower().split())
        if dedup_key in seen:
            logger.info(f"Deduplicated card with question: {raw_question}")
            continue
        seen.add(dedup_key)
        kept.append(entry)
    return kept
|
|
|
|
|
|
|
|
|
|
|
def generate_cards_from_crawled_content(
    all_cards: List[Card],
) -> List[Dict[str, Any]]:
    """
    Processes a list of Card objects (expected to have plain text fields after generate_cards_batch)
    and formats them into a list of dictionaries suitable for the DataFrame.
    """
    if not all_cards:
        return []

    rows: List[Dict[str, Any]] = []
    for position, card in enumerate(all_cards):
        meta = card.metadata or {}

        def _as_plain_list_text(value):
            # Lists become comma-joined text; anything else is stringified.
            return strip_html_tags(
                ", ".join(value) if isinstance(value, list) else str(value)
            )

        fallback_topic = f"Crawled Content - Card {position + 1}"
        topic = meta.get("topic", fallback_topic)

        rows.append(
            {
                "Index": str(position + 1),
                "Topic": strip_html_tags(topic),
                "Card_Type": strip_html_tags(card.card_type or "basic"),
                "Question": card.front.question or "",
                "Answer": card.back.answer or "",
                "Explanation": card.back.explanation or "",
                "Example": card.back.example or "",
                "Prerequisites": _as_plain_list_text(meta.get("prerequisites", [])),
                "Learning_Outcomes": _as_plain_list_text(
                    meta.get("learning_outcomes", [])
                ),
                # NOTE(review): this reads the "common_misconceptions" key while
                # format_cards_for_dataframe reads "misconceptions" — confirm
                # which key the metadata producers actually emit.
                "Common_Misconceptions": _as_plain_list_text(
                    meta.get("common_misconceptions", [])
                ),
                "Difficulty": strip_html_tags(str(meta.get("difficulty", "N/A"))),
                "Source_URL": strip_html_tags(meta.get("source_url", "")),
            }
        )
    return rows
|
|
|
|
|
def generate_token_usage_html(token_usage=None):
    """Generate HTML for token usage display"""
    # Anything other than a non-empty dict yields the placeholder markup.
    if not (token_usage and isinstance(token_usage, dict)):
        return "<div style='margin-top: 8px;'><b>Token Usage:</b> No usage data</div>"
    token_count = token_usage.get("total_tokens", 0)
    return f"<div style='margin-top: 8px;'><b>Token Usage:</b> {token_count} tokens</div>"
|
|