Spaces:

ambrosfitz
/

md-qa-test

Sleeping

App Files Community

ambrosfitz committed on Oct 13, 2024

Commit

afaec0f

verified ·

1 Parent(s): b5ae83c

Update question_generator.py

Browse files

Files changed (1) hide show

question_generator.py +17 -125

question_generator.py CHANGED Viewed

@@ -4,6 +4,7 @@ import os
 import logging
 import hashlib
 import json
 from typing import List, Dict
 from datetime import datetime
 from mistralai.client import MistralClient
@@ -23,134 +24,19 @@ model = "mistral-large-latest"
 # Initialize Mistral client
 client = MistralClient(api_key=api_key)
-def load_csv_data(file_path: str) -> List[Dict[str, str]]:
-    """Load data from a CSV file."""
-    logging.info(f"Loading data from {file_path}...")
-    try:
-        with open(file_path, 'r', encoding='utf-8') as csvfile:
-            reader = csv.DictReader(csvfile)
-            data = list(reader)
-        logging.info(f"Loaded {len(data)} rows from {file_path}")
-        return data
-    except FileNotFoundError:
-        logging.error(f"File not found: {file_path}")
-        raise
-    except csv.Error as e:
-        logging.error(f"Error reading CSV file {file_path}: {e}")
-        raise
-# Load data from both CSV files
-try:
-    detailed_cases = load_csv_data('processed_medical_history.csv')
-    infectious_diseases = load_csv_data('infectious_diseases.csv')
-except Exception as e:
-    logging.error(f"Failed to load CSV data: {e}")
-    raise
-def hash_question(question: str) -> str:
-    """Generate a hash for a question to check for duplicates."""
-    return hashlib.md5(question.encode()).hexdigest()
-def load_generated_questions() -> set:
-    """Load previously generated question hashes from a file."""
-    try:
-        with open('generated_questions.txt', 'r') as f:
-            return set(line.strip() for line in f)
-    except FileNotFoundError:
-        return set()
-def save_generated_question(question_hash: str):
-    """Save a newly generated question hash to the file."""
-    with open('generated_questions.txt', 'a') as f:
-        f.write(question_hash + '\n')
-generated_questions = load_generated_questions()
 def generate_microbiology_question() -> Dict[str, str]:
     """Generate a microbiology question."""
-    question_types = [
-        "clinical_vignette",
-        "mechanism_of_pathogenesis",
-        "laboratory_diagnosis",
-        "antimicrobial_resistance",
-        "vaccine_preventable_disease",
-        "microbial_physiology_genetics",
-        "epidemiology_transmission"
-    ]
-    question_type = random.choice(question_types)
-    logging.info(f"Generating {question_type} question...")
-    if question_type == "clinical_vignette":
-        case = random.choice(detailed_cases)
-        context = f"""
-        Pathogen: {case['Pathogen_Name']} ({case['Pathogen_Type']})
-        Key Symptoms: {case['Key_Symptoms']}
-        Physical Findings: {case['Physical_Findings']}
-        Lab Results: {case['Lab_Results']}
-        Patient Demographics: {case['Patient_Demographics']}
-        """
-    else:
-        disease = random.choice(infectious_diseases)
-        context = f"""
-        Infectious Agent: {disease['infectious_agent']}
-        Diagnosis: {disease['diagnosis']}
-        Treatment: {disease['treatment']}
-        """
-    prompt = f"""
-    Create a microbiology question that could appear on the NBME exam. This should be a {question_type} question.
-    Use the following information as inspiration, but feel free to expand or modify:
-    {context}
-    Generate a question based on the following template, depending on the question type:
-    1. Clinical Vignette with Pathogen Identification:
-    A [age]-year-old [gender] presents with [symptoms and clinical findings]. [Additional relevant information]. Which of the following is the most likely causal organism?
-    2. Mechanism of Pathogenesis:
-    [Description of a pathogen or clinical scenario]
-    Which of the following best describes the mechanism by which this organism causes disease?
-    3. Laboratory Diagnosis:
-    A patient presents with [symptoms]. [Description of laboratory findings or test results].
-    Which of the following is the most likely diagnosis based on these laboratory findings?
-    4. Antimicrobial Mechanism and Resistance:
-    A patient is diagnosed with [infection]. The causative organism is found to be resistant to [antibiotic]. Which of the following mechanisms is most likely responsible for this resistance?
-    5. Vaccine-Preventable Disease:
-    A [age]-year-old [gender] presents with [symptoms of a vaccine-preventable disease]. Which of the following vaccines would have been most likely to prevent this condition?
-    6. Microbial Physiology and Genetics:
-    An investigator observes [description of microbial behavior or genetic phenomenon]. Which of the following best explains this observation?
-    7. Epidemiology and Transmission:
-    A cluster of [disease] cases is reported in [location]. [Description of affected population and circumstances]. Which of the following is the most likely mode of transmission?
-    Include:
-    1. The question based on the selected template
-    2. Five possible answer options (A through E)
-    3. The correct answer
-    4. A brief explanation of why the correct answer is right and why the other options are incorrect
-    5. Detailed medical reasoning for the correct answer, including relevant pathophysiology, microbiology concepts, and clinical implications.
-    Format the response as a JSON object with the following keys:
-    {{
-        "question": "The question text",
-        "options": {{
-            "A": "Option A text",
-            "B": "Option B text",
-            "C": "Option C text",
-            "D": "Option D text",
-            "E": "Option E text"
-        }},
-        "correct_answer": "The letter of the correct answer (A, B, C, D, or E)",
-        "explanation": "The explanation text",
-        "medical_reasoning": "The detailed medical reasoning text"
-    }}
-    """
     try:
         chat_response = client.chat(
@@ -164,8 +50,14 @@ def generate_microbiology_question() -> Dict[str, str]:
         response_content = chat_response.choices[0].message.content
         logging.info(f"Received response from Mistral API: {response_content[:100]}...")  # Log first 100 characters
         # Parse the JSON response
-        question_data = json.loads(response_content)
         # Validate the structure of the parsed JSON
         required_keys = ["question", "options", "correct_answer", "explanation", "medical_reasoning"]

 import logging
 import hashlib
 import json
+import re
 from typing import List, Dict
 from datetime import datetime
 from mistralai.client import MistralClient
 # Initialize Mistral client
 client = MistralClient(api_key=api_key)
+# ... (previous functions remain the same)
+def extract_json_from_markdown(markdown_text: str) -> str:
+    """Extract JSON content from Markdown-formatted text."""
+    json_match = re.search(r'```json\s*(.*?)\s*```', markdown_text, re.DOTALL)
+    if json_match:
+        return json_match.group(1)
+    else:
+        raise ValueError("No JSON content found in the Markdown text")
 def generate_microbiology_question() -> Dict[str, str]:
     """Generate a microbiology question."""
+    # ... (previous code remains the same)
     try:
         chat_response = client.chat(
         response_content = chat_response.choices[0].message.content
         logging.info(f"Received response from Mistral API: {response_content[:100]}...")  # Log first 100 characters
+        # Extract JSON from Markdown if necessary
+        try:
+            json_content = extract_json_from_markdown(response_content)
+        except ValueError:
+            json_content = response_content  # If not in Markdown, use the original content
         # Parse the JSON response
+        question_data = json.loads(json_content)
         # Validate the structure of the parsed JSON
         required_keys = ["question", "options", "correct_answer", "explanation", "medical_reasoning"]