import os
import json
import pandas as pd
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import csv
import yaml
from typing import List, Dict, Any
import random
from pypdf import PdfReader
import re
import tempfile
from huggingface_hub import HfApi

# Configuration
DEFAULT_MODEL = "tiiuae/falcon-7b-instruct"  # Use Falcon-7B as the default model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # Use CUDA if available, otherwise CPU
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7
HF_TOKEN = os.environ.get("HF_TOKEN") or None  # Hugging Face token from the environment, if set
MAX_RAM_GB = 45  # Maximum RAM usage in GB (kept below the 70GB Space limit)

# Create offload folder for model memory management
os.makedirs("offload_folder", exist_ok=True)

# Setup RAM monitoring
def get_process_memory_usage():
    """Get the current memory usage of this process in GB."""
    import psutil
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024 * 1024)  # Convert bytes to GB
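
# Illustrative use of the RAM guard (a sketch mirroring the checks in
# process_pdf_generate_qa further below; the 0.9 threshold is just an example value):
#
#   if get_process_memory_usage() > MAX_RAM_GB * 0.9:
#       import gc
#       gc.collect()                  # release unreferenced Python objects
#       if torch.cuda.is_available():
#           torch.cuda.empty_cache()  # return cached GPU blocks to the allocator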

class PdfExtractor:
    """Extract text content from PDF files."""

    @staticmethod
    def extract_text_from_pdf(pdf_file):
        """Extract text from a PDF file."""
        try:
            reader = PdfReader(pdf_file)
            text = ""
            for page in reader.pages:
                # extract_text() can return None for image-only pages
                text += (page.extract_text() or "") + "\n"
            return text
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return None

    @staticmethod
    def clean_text(text):
        """Clean and preprocess extracted text."""
        if not text:
            return ""
        # Replace multiple newlines with a single newline
        text = re.sub(r'\n+', '\n', text)
        # Replace runs of whitespace with a single space
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def chunk_text(text, max_chunk_size=1000, overlap=100):
        """Split text into chunks of roughly max_chunk_size characters with overlap."""
        if not text:
            return []
        chunks = []
        start = 0
        text_length = len(text)
        while start < text_length:
            end = min(start + max_chunk_size, text_length)
            # If we're not at the end, try to break at a sentence or paragraph boundary
            if end < text_length:
                # Look for sentence breaks (period, question mark, exclamation mark followed by a space)
                sentence_break = max(
                    text.rfind('. ', start, end),
                    text.rfind('? ', start, end),
                    text.rfind('! ', start, end),
                    text.rfind('\n', start, end)
                )
                if sentence_break > start + max_chunk_size // 2:
                    end = sentence_break + 1
            chunks.append(text[start:end].strip())
            if end >= text_length:
                break  # Last chunk reached; avoid looping forever on the overlap
            start = end - overlap  # Overlap with the previous chunk
        return chunks
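
# Illustrative sketch (defined here for reference, not called anywhere in this app)
# of how the three extractor utilities compose on a standalone PDF path.
# "example.pdf" is a hypothetical placeholder; the chunking parameters echo the defaults above.
def _example_pdf_to_chunks(pdf_path="example.pdf"):
    """Extract, clean, and chunk a PDF into overlapping text chunks."""
    raw_text = PdfExtractor.extract_text_from_pdf(pdf_path)
    cleaned = PdfExtractor.clean_text(raw_text)
    return PdfExtractor.chunk_text(cleaned, max_chunk_size=1000, overlap=100)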

class SyntheticDataGenerator:
    def __init__(self, model_name=DEFAULT_MODEL):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.load_model()  # Load the model directly during initialization

    def load_model(self):
        """Load the specified model; returns True on success, raises on failure."""
        # Clear CUDA cache if using GPU to prevent memory fragmentation
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        try:
            print(f"Loading model {self.model_name} on {DEVICE}...")
            # Add token for authentication if available
            tokenizer_kwargs = {}
            model_kwargs = {
                "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
                "device_map": "auto" if torch.cuda.is_available() else None,
                "low_cpu_mem_usage": True,  # Reduce peak RAM while loading
                "offload_folder": "offload_folder"  # Offload folder for large models
            }
            if HF_TOKEN:
                tokenizer_kwargs["token"] = HF_TOKEN
                model_kwargs["token"] = HF_TOKEN
                print("Using Hugging Face token for authentication")
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, **tokenizer_kwargs)
            # Load the model
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                **model_kwargs
            )
            # Ensure the model is on the right device when not using device_map="auto"
            if not torch.cuda.is_available():
                self.model = self.model.to(DEVICE)
            print(f"Model {self.model_name} loaded successfully on {DEVICE}")
            return True
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            self.model = None
            self.tokenizer = None
            raise

    def generate_qa_prompt(self, context, num_questions=3, include_tags=True, difficulty_levels=True):
        """Generate a prompt for creating Q&A pairs from context."""
        tag_instruction = ""
        if include_tags:
            tag_instruction = "Add 1-3 tags for each question that categorize the topic or subject matter."
        difficulty_instruction = ""
        if difficulty_levels:
            difficulty_instruction = "For each question, assign a difficulty level (easy, medium, or hard)."
        prompt = f"""Task: Based on the following text, generate {num_questions} question and answer pairs that would be useful for comprehension testing or knowledge assessment.

CONTEXT:
{context}

For each question:
1. Write a clear, specific question about the information in the text
2. Provide the correct answer to the question, citing relevant details from the text
3. {tag_instruction}
4. {difficulty_instruction}

Format each Q&A pair as a JSON object with the following structure:
{{
  "question": "The question text",
  "answer": "The answer text",
  "tags": ["tag1", "tag2"],
  "difficulty": "easy/medium/hard"
}}

Return all Q&A pairs in a JSON array.
"""
        return prompt
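
    # Illustrative call (the context string is hypothetical), producing a prompt that
    # asks for two questions with no tags or difficulty labels:
    #
    #   prompt = generator.generate_qa_prompt(
    #       "Photosynthesis converts light energy into chemical energy ...",
    #       num_questions=2, include_tags=False, difficulty_levels=False)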

    def generate_data(self, prompt, num_samples=1):
        """Generate synthetic data using the loaded model."""
        if not self.model or not self.tokenizer:
            return ["Error: Model not loaded properly. Please try again with a different model."]
        outputs = []
        for sample_idx in range(num_samples):
            try:
                # Clear CUDA cache before generating to free up memory
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                # ZeroGPU errors often occur in generate() calls,
                # so try multiple approaches in sequence
                inputs = self.tokenizer(prompt, return_tensors="pt").to(DEVICE)
                try:
                    # First try: standard generation with conservative settings
                    with torch.no_grad():
                        output = self.model.generate(
                            **inputs,
                            max_new_tokens=MAX_NEW_TOKENS,
                            temperature=TEMPERATURE,
                            do_sample=True,
                            pad_token_id=self.tokenizer.eos_token_id,
                            num_beams=1,  # Sampled decoding instead of beam search
                            no_repeat_ngram_size=3  # Prevent repetition
                        )
                    decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
                except Exception as e:
                    if "CUDA" in str(e) or "GPU" in str(e) or "ZeroGPU" in str(e):
                        print(f"GPU error during generation: {e}")
                        print("Falling back to CPU generation...")
                        # Move everything to CPU
                        inputs = {k: v.to('cpu') for k, v in inputs.items()}
                        if torch.cuda.is_available():
                            # Temporarily move the model to CPU for this generation
                            model_cpu = self.model.to('cpu')
                            with torch.no_grad():
                                output = model_cpu.generate(
                                    **inputs,
                                    max_new_tokens=MAX_NEW_TOKENS,
                                    temperature=TEMPERATURE,
                                    do_sample=True,
                                    pad_token_id=self.tokenizer.eos_token_id,
                                    num_return_sequences=1
                                )
                            decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
                            # Move the model back to CUDA for future calls
                            self.model = self.model.to(DEVICE)
                        else:
                            # Already on CPU, retry with reduced parameters
                            with torch.no_grad():
                                output = self.model.generate(
                                    **inputs,
                                    max_new_tokens=min(256, MAX_NEW_TOKENS),  # Reduce token count
                                    temperature=0.5,  # Lower temperature
                                    do_sample=False,  # No sampling
                                    num_return_sequences=1,
                                    pad_token_id=self.tokenizer.eos_token_id
                                )
                            decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
                    else:
                        # Re-raise non-GPU errors
                        raise
                # Extract only the generated part (remove the prompt)
                prompt_text = self.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
                generated_text = decoded_output[len(prompt_text):].strip()
                outputs.append(generated_text)
                # Clear CUDA cache between samples
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                error_msg = f"Error generating sample {sample_idx+1}: {str(e)}"
                print(error_msg)
                outputs.append(f"Error: {error_msg}")
        return outputs
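
    # Illustrative call (assumes a generator was constructed successfully and a prompt
    # was built with generate_qa_prompt above):
    #
    #   texts = generator.generate_data(prompt, num_samples=1)
    #   print(texts[0])  # raw completion with the prompt text stripped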

    def parse_json_data(self, generated_text):
        """Extract and parse JSON from generated text."""
        try:
            # Find JSON array content (between [ and ])
            start_idx = generated_text.find('[')
            end_idx = generated_text.rfind(']') + 1
            if start_idx >= 0 and end_idx > start_idx:
                json_str = generated_text[start_idx:end_idx]
                return json.loads(json_str)
            # Fall back to a single-object format
            start_idx = generated_text.find('{')
            end_idx = generated_text.rfind('}') + 1
            if start_idx >= 0 and end_idx > start_idx:
                json_str = generated_text[start_idx:end_idx]
                return json.loads(json_str)
            print(f"Could not find JSON content in: {generated_text}")
            return None
        except json.JSONDecodeError as e:
            print(f"JSON parse error: {e}")
            print(f"Problematic text: {generated_text}")
            # Try to fix common JSON formatting errors
            try:
                # Replace single quotes with double quotes
                json_str = generated_text[start_idx:end_idx].replace("'", "\"")
                return json.loads(json_str)
            except Exception:
                pass
            # If still failing, try to extract individual JSON objects
            try:
                pattern = r'\{[^{}]*\}'
                matches = re.findall(pattern, generated_text)
                if matches:
                    results = []
                    for match in matches:
                        try:
                            # Replace single quotes with double quotes
                            fixed_match = match.replace("'", "\"")
                            obj = json.loads(fixed_match)
                            results.append(obj)
                        except Exception:
                            continue
                    if results:
                        return results
            except Exception:
                pass
            return None
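
    # Illustrative behaviour (the sample reply below is hypothetical; real model output
    # varies): a JSON array embedded in surrounding prose is located and parsed.
    #
    #   reply = 'Here are the pairs: [{"question": "What is X?", "answer": "Y."}] Done.'
    #   generator.parse_json_data(reply)
    #   # -> [{'question': 'What is X?', 'answer': 'Y.'}]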

    def generate_qa_from_pdf_chunk(self, chunk, num_questions=3, include_tags=True, difficulty_levels=True):
        """Generate Q&A pairs from a PDF text chunk."""
        if not self.model or not self.tokenizer:
            return [], "Error: Model not loaded properly. Please try again with a different model."
        if not chunk or len(chunk.strip()) < 100:  # Skip very small chunks
            return [], "Chunk too small to generate meaningful Q&A pairs."
        prompt = self.generate_qa_prompt(chunk, num_questions, include_tags, difficulty_levels)
        raw_outputs = self.generate_data(prompt, num_samples=1)
        raw_output = raw_outputs[0]
        parsed_data = self.parse_json_data(raw_output)
        # Ensure parsed data is a list
        if parsed_data and isinstance(parsed_data, dict):
            parsed_data = [parsed_data]
        # Return both the parsed data and the raw output for debugging
        return parsed_data, raw_output

def format_data_preview(data):
    """Format the data for preview in the UI."""
    if isinstance(data, list):
        if len(data) > 0 and isinstance(data[0], dict):
            # Convert list of dicts to a DataFrame for better display
            return pd.DataFrame(data).to_string()
        else:
            return json.dumps(data, indent=2)
    elif isinstance(data, dict):
        return json.dumps(data, indent=2)
    else:
        return str(data)

def save_data(data, format, filename_prefix):
    """Save data to a file in the specified format."""
    os.makedirs("synthetic_data", exist_ok=True)
    timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    filename = f"synthetic_data/{filename_prefix}_{timestamp}"
    if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
        df = pd.DataFrame(data)
        if format.lower() == "csv":
            full_filename = f"{filename}.csv"
            df.to_csv(full_filename, index=False)
        elif format.lower() == "json":
            full_filename = f"{filename}.json"
            with open(full_filename, "w") as f:
                json.dump(data, f, indent=2)
        elif format.lower() == "excel":
            full_filename = f"{filename}.xlsx"
            df.to_excel(full_filename, index=False)
        else:
            full_filename = f"{filename}.txt"
            with open(full_filename, "w") as f:
                f.write(str(data))
    else:
        full_filename = f"{filename}.{format.lower()}"
        with open(full_filename, "w") as f:
            if format.lower() == "json":
                json.dump(data, f, indent=2)
            else:
                f.write(str(data))
    return full_filename
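
# Example (illustrative; the timestamp embedded in the filename will differ):
#
#   save_data([{"question": "Q", "answer": "A"}], "json", "qa_dataset")
#   # -> "synthetic_data/qa_dataset_20240101_120000.json"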

def load_models():
    """Return a list of available models."""
    return [
        "tiiuae/falcon-7b-instruct"
    ]

def process_pdf_generate_qa(pdf_file, model_name, num_questions_per_chunk, include_tags, include_difficulty, output_file_format, progress=None):
    """Process a PDF file and generate Q&A pairs from its content."""
    if pdf_file is None:
        return None, "Error: No PDF file uploaded", "", "No file provided"
    try:
        import gc
        # Check RAM usage at start
        current_ram_usage = get_process_memory_usage()
        print(f"Starting RAM usage: {current_ram_usage:.2f}GB")
        # Clear CUDA cache before starting
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # Initialize the extractor and generator; the generator loads its model in __init__
        extractor = PdfExtractor()
        try:
            generator = SyntheticDataGenerator(model_name)
        except Exception as e:
            if "ZeroGPU" in str(e) or "GPU task aborted" in str(e) or "CUDA" in str(e):
                print(f"GPU error during model loading: {e}. Trying the default model...")
                # On a GPU error, fall back to the default model
                try:
                    generator = SyntheticDataGenerator(DEFAULT_MODEL)
                except Exception:
                    return None, "Error: Failed to load any model even after fallback. Please try again later.", "", "Model loading failed"
            else:
                # Re-raise other errors
                raise
        # Check RAM usage after model loading
        ram_after_model = get_process_memory_usage()
        print(f"RAM usage after model loading: {ram_after_model:.2f}GB")
        # Save the PDF temporarily if it arrived as raw bytes rather than a file path
        if hasattr(pdf_file, 'name'):
            # It's already a file path
            pdf_path = pdf_file.name
        else:
            # Create a temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
                tmp.write(pdf_file)
                pdf_path = tmp.name
        # Extract text from the PDF
        pdf_text = extractor.extract_text_from_pdf(pdf_path)
        if not pdf_text:
            return None, "Failed to extract text from PDF", "", "No data generated"
        # Clean and chunk the text; smaller chunks use less memory
        cleaned_text = extractor.clean_text(pdf_text)
        chunks = extractor.chunk_text(cleaned_text, max_chunk_size=400, overlap=30)
        # Check RAM after PDF processing
        ram_after_pdf = get_process_memory_usage()
        print(f"RAM usage after PDF processing: {ram_after_pdf:.2f}GB, found {len(chunks)} chunks")
        # If we're already approaching the RAM limit, reduce the batch size
        batch_size = 3  # Default
        if ram_after_pdf > MAX_RAM_GB * 0.7:  # Already using 70% of our limit
            batch_size = 1  # Process one chunk at a time
            print(f"High RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 1")
        elif ram_after_pdf > MAX_RAM_GB * 0.5:  # Using 50% of our limit
            batch_size = 2  # Process two chunks at a time
            print(f"Moderate RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 2")
        # Generate Q&A pairs for each chunk
        all_qa_pairs = []
        all_raw_outputs = []
        total_chunks = len(chunks)
        # Process chunks in smaller batches to avoid memory buildup
        for i in range(0, total_chunks, batch_size):
            # Get the current batch of chunks
            batch_chunks = chunks[i:min(i + batch_size, total_chunks)]
            # Process each chunk in the batch
            for j, chunk in enumerate(batch_chunks):
                chunk_index = i + j
                if progress is not None:
                    progress(chunk_index / total_chunks, f"Processing chunk {chunk_index+1}/{total_chunks}")
                # Check whether we're approaching the RAM limit
                current_ram = get_process_memory_usage()
                if current_ram > MAX_RAM_GB * 0.9:  # Over 90% of our limit
                    print(f"WARNING: High RAM usage detected: {current_ram:.2f}GB - force releasing memory")
                    gc.collect()  # Force garbage collection
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    # If still too high after garbage collection, abort batch processing
                    current_ram = get_process_memory_usage()
                    if current_ram > MAX_RAM_GB * 0.95:  # Still dangerously high
                        print(f"CRITICAL: RAM usage too high ({current_ram:.2f}GB), stopping processing")
                        break
                # Clear CUDA cache between chunks
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                try:
                    qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(
                        chunk,
                        num_questions=num_questions_per_chunk,
                        include_tags=include_tags,
                        difficulty_levels=include_difficulty
                    )
                except Exception as e:
                    error_type = str(e)
                    if "CUDA" in error_type or "GPU" in error_type or "ZeroGPU" in error_type:
                        print(f"GPU error during generation for chunk {chunk_index+1}: {e}")
                        # Skip this chunk and keep going
                        raw_output = f"Error in chunk {chunk_index+1}: {str(e)}. Skipping..."
                        qa_pairs = None
                    elif "memory" in error_type.lower() or "ram" in error_type.lower():
                        print(f"Memory error processing chunk {chunk_index+1}: {e}")
                        # Force garbage collection and skip the chunk
                        gc.collect()
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                        raw_output = f"Memory error in chunk {chunk_index+1}: {str(e)}. Skipping..."
                        qa_pairs = None
                    else:
                        # For other errors, just log and continue
                        print(f"Error processing chunk {chunk_index+1}: {e}")
                        raw_output = f"Error in chunk {chunk_index+1}: {str(e)}"
                        qa_pairs = None
                if qa_pairs:
                    all_qa_pairs.extend(qa_pairs)
                all_raw_outputs.append(raw_output)
                # Check RAM usage after processing this chunk
                current_ram = get_process_memory_usage()
                print(f"RAM after chunk {chunk_index+1}: {current_ram:.2f}GB")
            # Do a thorough cleanup after each batch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # Force garbage collection between batches
            gc.collect()
            # Check whether we need to abort due to memory constraints
            current_ram = get_process_memory_usage()
            if current_ram > MAX_RAM_GB:
                print(f"WARNING: Exceeding RAM limit ({current_ram:.2f}GB). Stopping further processing.")
                if progress is not None:
                    progress(1.0, f"Stopped early due to high memory usage ({current_ram:.2f}GB)")
                break
        if progress is not None:
            progress(1.0, "Finished processing")
        # Final cache clear and garbage collection
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        if not all_qa_pairs:
            return None, "Failed to generate Q&A pairs", "\n\n".join(all_raw_outputs), "No data generated"
        # Save the data to a file
        filename = save_data(
            all_qa_pairs,
            output_file_format,
            "qa_dataset"
        )
        # Format for display
        formatted_data = format_data_preview(all_qa_pairs)
        # Final memory report
        final_ram = get_process_memory_usage()
        print(f"Final RAM usage: {final_ram:.2f}GB")
        return all_qa_pairs, formatted_data, "\n\n".join(all_raw_outputs), f"Data saved to {filename}"
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        print(error_msg)
        import traceback
        print(traceback.format_exc())
        return None, error_msg, "", "Processing failed"
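
# Note: process_pdf_generate_qa returns a 4-tuple that maps onto the Gradio outputs
# wired up in create_interface below:
#   (qa_pairs, formatted_preview, raw_model_output, status_message)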

# Set up the Gradio interface
def create_interface():
    with gr.Blocks(title="PDF Q&A Dataset Generator") as app:
        gr.Markdown("# 📚 PDF Q&A Dataset Generator")
        gr.Markdown("""
        Generate question & answer datasets from PDF documents using instruction-tuned language models.
        Perfect for creating educational resources, quiz materials, or training data for Q&A systems.
        """)
        with gr.Tabs() as tabs:
            with gr.TabItem("Generate Q&A Dataset"):
                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_file = gr.File(
                            label="Upload PDF",
                            file_types=[".pdf"],
                            type="binary"
                        )
                        model_dropdown = gr.Dropdown(
                            choices=load_models(),
                            value=DEFAULT_MODEL,
                            label="Model"
                        )
                        num_questions = gr.Slider(
                            minimum=1,
                            maximum=5,
                            value=3,
                            step=1,
                            label="Questions per Section"
                        )
                        include_tags = gr.Checkbox(
                            value=True,
                            label="Include Tags"
                        )
                        include_difficulty = gr.Checkbox(
                            value=True,
                            label="Include Difficulty Levels"
                        )
                        output_file_format = gr.Radio(
                            choices=["json", "csv", "excel"],
                            value="json",
                            label="Save File Format"
                        )
                        generate_btn = gr.Button("Generate Q&A Dataset", variant="primary")
                        progress_bar = gr.Progress()
                    with gr.Column(scale=2):
                        with gr.Tab("Parsed Data"):
                            parsed_data_output = gr.JSON(label="Generated Q&A Pairs")
                            formatted_data_output = gr.Textbox(
                                label="Formatted Preview",
                                lines=15
                            )
                        with gr.Tab("Raw Output"):
                            raw_output = gr.Textbox(
                                label="Raw Model Output",
                                lines=15
                            )
                        file_output = gr.Textbox(label="File Output")

            with gr.TabItem("Documentation"):
                gr.Markdown("""
                ## How to Use
                1. **Upload a PDF**: Select a PDF document containing the content you want to generate questions from.
                2. **Select a model**: Choose an instruction-tuned language model from the dropdown.
                3. **Configure settings**:
                   - Set the number of questions to generate per text section
                   - Choose whether to include tags and difficulty levels
                   - Select your preferred output file format
                4. **Generate dataset**: Click the "Generate Q&A Dataset" button to create your dataset.

                ## About This App
                This app uses instruction-tuned language models to generate question and answer pairs from PDF documents. It:
                1. Extracts text from the uploaded PDF
                2. Splits the text into manageable chunks
                3. Generates questions, answers, tags, and difficulty levels for each chunk
                4. Combines all Q&A pairs into a comprehensive dataset

                ### Features:
                - Automatic text extraction from PDFs
                - Smart text chunking to maintain context
                - Customizable number of questions per chunk
                - Optional tagging and difficulty classification
                - Multiple output formats (JSON, CSV, Excel)

                ### Use Cases:
                - Create educational resources and quiz materials
                - Generate training data for Q&A systems
                - Build flashcard datasets for studying
                - Develop content for educational applications
                """)
            with gr.TabItem("Status"):
                gr.Markdown("""
                ## System Status
                This app uses a GPU when one is available and otherwise falls back to CPU, where larger models are slower to load and generate content.
                If you encounter issues with a specific model, try switching back to the default `tiiuae/falcon-7b-instruct`.

                ### Troubleshooting
                - If the app seems unresponsive after clicking "Generate", please be patient - model loading and generation take time.
                - If you get an error about model loading, try refreshing the page and selecting a different model.
                - Not all PDFs can be processed cleanly - if text extraction fails, try a different PDF.
                """)

        # Event handler for the generate button
        generate_btn.click(
            process_pdf_generate_qa,
            inputs=[
                pdf_file,
                model_dropdown,
                num_questions,
                include_tags,
                include_difficulty,
                output_file_format
            ],
            outputs=[parsed_data_output, formatted_data_output, raw_output, file_output],
            show_progress=True
        )
    return app

# Export the app for Hugging Face Spaces
app = create_interface()

# Launch the app when run directly
if __name__ == "__main__":
    app.launch()