Spaces:
Sleeping
Sleeping
import random | |
import csv | |
import os | |
import logging | |
import hashlib | |
import json | |
import re | |
from typing import List, Dict | |
from datetime import datetime | |
from mistralai.client import MistralClient | |
from mistralai.models.chat_completion import ChatMessage | |
# Configure root logging: timestamp, severity, then the message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# The Mistral API key comes from the environment. A missing or empty value is
# logged and then aborts module import with ValueError.
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    logging.error("MISTRAL_API_KEY environment variable is not set.")
    raise ValueError("MISTRAL_API_KEY environment variable is not set.")

# Model identifier passed to the chat request.
model = "mistral-large-latest"

# Shared Mistral client used for the chat request.
client = MistralClient(api_key=api_key)
# Load data from CSV files | |
def load_csv_data(file_path: str) -> List[Dict[str, str]]:
    """Load every data row of a CSV file as a list of dictionaries.

    The first row of the file is used as the header, so each returned
    dictionary maps column names to that row's string values.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        One dictionary per data row, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (logged, then
            re-raised).
        csv.Error: If the csv module rejects the file content (logged, then
            re-raised).
    """
    logging.info(f"Loading data from {file_path}...")
    try:
        # newline='' leaves newline handling to the csv module, so line breaks
        # embedded in quoted fields are preserved exactly as written.
        # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which
        # would otherwise become part of the first column name.
        with open(file_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
            data = list(csv.DictReader(csvfile))
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except csv.Error as e:
        logging.error(f"Error reading CSV file {file_path}: {e}")
        raise
    logging.info(f"Loaded {len(data)} rows from {file_path}")
    return data
# Read both datasets at import time. Any failure is logged and re-raised, so
# the module does not finish importing without its data.
try:
    detailed_cases = load_csv_data('processed_medical_history.csv')
    infectious_diseases = load_csv_data('infectious_diseases.csv')
except Exception as load_error:
    logging.error("Failed to load CSV data: %s", load_error)
    raise
def hash_question(question: str) -> str:
    """Return the MD5 hex digest of ``question`` for duplicate detection."""
    digest = hashlib.md5()
    digest.update(question.encode())
    return digest.hexdigest()
def load_generated_questions() -> set:
    """Read the stored question hashes from ``generated_questions.txt``.

    Returns:
        The set of stripped lines in the file, or an empty set when the file
        does not exist.
    """
    try:
        hash_file = open('generated_questions.txt', 'r')
    except FileNotFoundError:
        # Nothing has been recorded yet.
        return set()
    with hash_file:
        return {entry.strip() for entry in hash_file}
def save_generated_question(question_hash: str):
    """Append ``question_hash`` as its own line to ``generated_questions.txt``."""
    with open('generated_questions.txt', mode='a') as hash_log:
        record = question_hash + '\n'
        hash_log.write(record)
# Question hashes read from generated_questions.txt once at import time;
# generate_microbiology_question() adds each new hash to this set.
generated_questions = load_generated_questions()
def extract_json_from_markdown(markdown_text: str) -> str:
    """Extract JSON content from a fenced code block in Markdown text.

    A fence tagged ``json`` (in any letter case) is preferred. If there is
    none, an untagged fence is accepted only when its content starts with
    ``{`` or ``[``, so fenced blocks in other languages are still rejected.

    Args:
        markdown_text: Text that may contain a fenced JSON block.

    Returns:
        The content of the first matching fence, with surrounding whitespace
        removed. The content is not parsed or validated here.

    Raises:
        ValueError: If no matching fenced block is found.
    """
    # Tagged fence first: ```json ... ``` / ```JSON ... ```
    fence_match = re.search(
        r'```json\s*(.*?)\s*```',
        markdown_text,
        re.DOTALL | re.IGNORECASE,
    )
    if fence_match is None:
        # Untagged fence whose body looks like a JSON object or array.
        fence_match = re.search(
            r'```\s*([{\[].*?)\s*```',
            markdown_text,
            re.DOTALL,
        )
    if fence_match is None:
        raise ValueError("No JSON content found in the Markdown text")
    return fence_match.group(1)
# Question categories; one is picked at random per generated question.
_QUESTION_TYPES = (
    "clinical_vignette",
    "mechanism_of_pathogenesis",
    "laboratory_diagnosis",
    "antimicrobial_resistance",
    "vaccine_preventable_disease",
    "microbial_physiology_genetics",
    "epidemiology_transmission",
)

# Top-level keys and option letters every parsed response must contain.
_REQUIRED_KEYS = ("question", "options", "correct_answer", "explanation", "medical_reasoning")
_OPTION_KEYS = ("A", "B", "C", "D", "E")

# User prompt, filled in with str.format(); doubled braces are literal braces.
_PROMPT_TEMPLATE = """
Create a microbiology question that could appear on the NBME exam. This should be a {question_type} question.
Use the following information as inspiration, but feel free to expand or modify:
{context}
Generate a question based on the following template, depending on the question type:
1. Clinical Vignette with Pathogen Identification:
A [age]-year-old [gender] presents with [symptoms and clinical findings]. [Additional relevant information]. Which of the following is the most likely causal organism?
2. Mechanism of Pathogenesis:
[Description of a pathogen or clinical scenario]
Which of the following best describes the mechanism by which this organism causes disease?
3. Laboratory Diagnosis:
A patient presents with [symptoms]. [Description of laboratory findings or test results].
Which of the following is the most likely diagnosis based on these laboratory findings?
4. Antimicrobial Mechanism and Resistance:
A patient is diagnosed with [infection]. The causative organism is found to be resistant to [antibiotic]. Which of the following mechanisms is most likely responsible for this resistance?
5. Vaccine-Preventable Disease:
A [age]-year-old [gender] presents with [symptoms of a vaccine-preventable disease]. Which of the following vaccines would have been most likely to prevent this condition?
6. Microbial Physiology and Genetics:
An investigator observes [description of microbial behavior or genetic phenomenon]. Which of the following best explains this observation?
7. Epidemiology and Transmission:
A cluster of [disease] cases is reported in [location]. [Description of affected population and circumstances]. Which of the following is the most likely mode of transmission?
Include:
1. The question based on the selected template
2. Five possible answer options (A through E)
3. The correct answer
4. A brief explanation of why the correct answer is right and why the other options are incorrect
5. Detailed medical reasoning for the correct answer, including relevant pathophysiology, microbiology concepts, and clinical implications.
Format the response as a JSON object with the following keys:
{{
"question": "The question text",
"options": {{
"A": "Option A text",
"B": "Option B text",
"C": "Option C text",
"D": "Option D text",
"E": "Option E text"
}},
"correct_answer": "The letter of the correct answer (A, B, C, D, or E)",
"explanation": "The explanation text",
"medical_reasoning": "The detailed medical reasoning text"
}}
"""


def _build_question_context(question_type: str) -> str:
    """Pick a random CSV row and render it as the prompt's inspiration text."""
    if question_type == "clinical_vignette":
        case = random.choice(detailed_cases)
        return (
            "\n"
            f"Pathogen: {case['Pathogen_Name']} ({case['Pathogen_Type']})\n"
            f"Key Symptoms: {case['Key_Symptoms']}\n"
            f"Physical Findings: {case['Physical_Findings']}\n"
            f"Lab Results: {case['Lab_Results']}\n"
            f"Patient Demographics: {case['Patient_Demographics']}\n"
        )
    disease = random.choice(infectious_diseases)
    return (
        "\n"
        f"Infectious Agent: {disease['infectious_agent']}\n"
        f"Diagnosis: {disease['diagnosis']}\n"
        f"Treatment: {disease['treatment']}\n"
    )


def _parse_question_response(response_content: str) -> dict:
    """Parse the model's reply into a question dict and validate its structure."""
    # Prefer a fenced JSON block; otherwise treat the whole reply as JSON.
    try:
        json_content = extract_json_from_markdown(response_content)
    except ValueError:
        json_content = response_content
    question_data = json.loads(json_content)

    # Valid JSON can still be a list, string or number; reject those up front
    # so the key checks below only ever run against a mapping.
    if not isinstance(question_data, dict):
        raise ValueError("Response is not a JSON object")
    if any(key not in question_data for key in _REQUIRED_KEYS):
        raise ValueError("Response is missing required keys")
    options = question_data["options"]
    if not isinstance(options, dict) or any(key not in options for key in _OPTION_KEYS):
        raise ValueError("Response is missing required option keys")
    return question_data


def generate_microbiology_question() -> Dict[str, str]:
    """Generate a microbiology question.

    Picks a random question type, builds a prompt from a random row of the
    matching CSV dataset, sends it to the Mistral chat API and parses the
    JSON reply. The hash of a new question text is added to
    ``generated_questions`` and appended to the hash file. A question whose
    hash is already known is still returned, but a warning is logged.

    Returns:
        The parsed response with the keys ``question``, ``options`` (a
        mapping with keys ``A`` to ``E``), ``correct_answer``,
        ``explanation`` and ``medical_reasoning``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        ValueError: If the parsed reply lacks the required structure.
        Exception: Any other error (for example from the API call) is logged
            and re-raised unchanged.
    """
    question_type = random.choice(_QUESTION_TYPES)
    logging.info(f"Generating {question_type} question...")
    prompt = _PROMPT_TEMPLATE.format(
        question_type=question_type,
        context=_build_question_context(question_type),
    )

    # Bound before the try block so the handlers below can log it even when
    # the failure happens before any reply was received.
    response_content = None
    try:
        chat_response = client.chat(
            model=model,
            messages=[
                ChatMessage(role="system", content="You are a medical educator creating unique microbiology questions for the NBME exam. Ensure each question is distinct from previously generated ones and follows the specified template."),
                ChatMessage(role="user", content=prompt),
            ],
        )
        response_content = chat_response.choices[0].message.content
        logging.info(f"Received response from Mistral API: {response_content[:100]}...")  # Log first 100 characters

        question_data = _parse_question_response(response_content)

        # Record the question hash so later runs can recognise repeats.
        question_hash = hash_question(question_data['question'])
        if question_hash in generated_questions:
            logging.warning("Generated question duplicates a previously generated one.")
        else:
            generated_questions.add(question_hash)
            save_generated_question(question_hash)
        return question_data
    except json.JSONDecodeError as e:
        logging.error(f"Failed to parse JSON response: {e}")
        logging.error(f"Response content: {response_content}")
        raise
    except ValueError as e:
        logging.error(f"Invalid response structure: {e}")
        logging.error(f"Response content: {response_content}")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        raise
# Script entry point: generate one question and pretty-print it as JSON.
if __name__ == "__main__":
    print(json.dumps(generate_microbiology_question(), indent=2))