Spaces:
Sleeping
Sleeping
ambrosfitz
committed on
Commit
•
afaec0f
1
Parent(s):
b5ae83c
Update question_generator.py
Browse files- question_generator.py +17 -125
question_generator.py
CHANGED
@@ -4,6 +4,7 @@ import os
|
|
4 |
import logging
|
5 |
import hashlib
|
6 |
import json
|
|
|
7 |
from typing import List, Dict
|
8 |
from datetime import datetime
|
9 |
from mistralai.client import MistralClient
|
@@ -23,134 +24,19 @@ model = "mistral-large-latest"
|
|
23 |
# Initialize Mistral client
|
24 |
client = MistralClient(api_key=api_key)
|
25 |
|
26 |
-
|
27 |
-
"""Load data from a CSV file."""
|
28 |
-
logging.info(f"Loading data from {file_path}...")
|
29 |
-
try:
|
30 |
-
with open(file_path, 'r', encoding='utf-8') as csvfile:
|
31 |
-
reader = csv.DictReader(csvfile)
|
32 |
-
data = list(reader)
|
33 |
-
logging.info(f"Loaded {len(data)} rows from {file_path}")
|
34 |
-
return data
|
35 |
-
except FileNotFoundError:
|
36 |
-
logging.error(f"File not found: {file_path}")
|
37 |
-
raise
|
38 |
-
except csv.Error as e:
|
39 |
-
logging.error(f"Error reading CSV file {file_path}: {e}")
|
40 |
-
raise
|
41 |
-
|
42 |
-
# Load data from both CSV files
|
43 |
-
try:
|
44 |
-
detailed_cases = load_csv_data('processed_medical_history.csv')
|
45 |
-
infectious_diseases = load_csv_data('infectious_diseases.csv')
|
46 |
-
except Exception as e:
|
47 |
-
logging.error(f"Failed to load CSV data: {e}")
|
48 |
-
raise
|
49 |
-
|
50 |
-
def hash_question(question: str) -> str:
    """Return the hexadecimal MD5 digest of a question's text.

    The digest serves as a compact fingerprint for spotting duplicate
    questions; it is not used for anything security-related here.
    """
    encoded_text = question.encode()
    digest = hashlib.md5(encoded_text)
    return digest.hexdigest()
|
53 |
-
|
54 |
-
def load_generated_questions() -> set:
    """Load previously generated question hashes from 'generated_questions.txt'.

    Each non-empty line of the file is treated as one hash; surrounding
    whitespace is stripped.

    Returns:
        A set of hash strings. The set is empty when the file does not exist.
    """
    try:
        # Explicit encoding keeps the read independent of the platform default.
        with open('generated_questions.txt', 'r', encoding='utf-8') as f:
            # Skip blank lines so an empty string never ends up in the set.
            return {stripped for line in f if (stripped := line.strip())}
    except FileNotFoundError:
        return set()
|
61 |
|
62 |
-
def
|
63 |
-
"""
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
68 |
|
69 |
def generate_microbiology_question() -> Dict[str, str]:
|
70 |
"""Generate a microbiology question."""
|
71 |
-
|
72 |
-
"clinical_vignette",
|
73 |
-
"mechanism_of_pathogenesis",
|
74 |
-
"laboratory_diagnosis",
|
75 |
-
"antimicrobial_resistance",
|
76 |
-
"vaccine_preventable_disease",
|
77 |
-
"microbial_physiology_genetics",
|
78 |
-
"epidemiology_transmission"
|
79 |
-
]
|
80 |
-
question_type = random.choice(question_types)
|
81 |
-
logging.info(f"Generating {question_type} question...")
|
82 |
-
|
83 |
-
if question_type == "clinical_vignette":
|
84 |
-
case = random.choice(detailed_cases)
|
85 |
-
context = f"""
|
86 |
-
Pathogen: {case['Pathogen_Name']} ({case['Pathogen_Type']})
|
87 |
-
Key Symptoms: {case['Key_Symptoms']}
|
88 |
-
Physical Findings: {case['Physical_Findings']}
|
89 |
-
Lab Results: {case['Lab_Results']}
|
90 |
-
Patient Demographics: {case['Patient_Demographics']}
|
91 |
-
"""
|
92 |
-
else:
|
93 |
-
disease = random.choice(infectious_diseases)
|
94 |
-
context = f"""
|
95 |
-
Infectious Agent: {disease['infectious_agent']}
|
96 |
-
Diagnosis: {disease['diagnosis']}
|
97 |
-
Treatment: {disease['treatment']}
|
98 |
-
"""
|
99 |
-
|
100 |
-
prompt = f"""
|
101 |
-
Create a microbiology question that could appear on the NBME exam. This should be a {question_type} question.
|
102 |
-
Use the following information as inspiration, but feel free to expand or modify:
|
103 |
-
|
104 |
-
{context}
|
105 |
-
|
106 |
-
Generate a question based on the following template, depending on the question type:
|
107 |
-
|
108 |
-
1. Clinical Vignette with Pathogen Identification:
|
109 |
-
A [age]-year-old [gender] presents with [symptoms and clinical findings]. [Additional relevant information]. Which of the following is the most likely causal organism?
|
110 |
-
|
111 |
-
2. Mechanism of Pathogenesis:
|
112 |
-
[Description of a pathogen or clinical scenario]
|
113 |
-
Which of the following best describes the mechanism by which this organism causes disease?
|
114 |
-
|
115 |
-
3. Laboratory Diagnosis:
|
116 |
-
A patient presents with [symptoms]. [Description of laboratory findings or test results].
|
117 |
-
Which of the following is the most likely diagnosis based on these laboratory findings?
|
118 |
-
|
119 |
-
4. Antimicrobial Mechanism and Resistance:
|
120 |
-
A patient is diagnosed with [infection]. The causative organism is found to be resistant to [antibiotic]. Which of the following mechanisms is most likely responsible for this resistance?
|
121 |
-
|
122 |
-
5. Vaccine-Preventable Disease:
|
123 |
-
A [age]-year-old [gender] presents with [symptoms of a vaccine-preventable disease]. Which of the following vaccines would have been most likely to prevent this condition?
|
124 |
-
|
125 |
-
6. Microbial Physiology and Genetics:
|
126 |
-
An investigator observes [description of microbial behavior or genetic phenomenon]. Which of the following best explains this observation?
|
127 |
-
|
128 |
-
7. Epidemiology and Transmission:
|
129 |
-
A cluster of [disease] cases is reported in [location]. [Description of affected population and circumstances]. Which of the following is the most likely mode of transmission?
|
130 |
-
|
131 |
-
Include:
|
132 |
-
1. The question based on the selected template
|
133 |
-
2. Five possible answer options (A through E)
|
134 |
-
3. The correct answer
|
135 |
-
4. A brief explanation of why the correct answer is right and why the other options are incorrect
|
136 |
-
5. Detailed medical reasoning for the correct answer, including relevant pathophysiology, microbiology concepts, and clinical implications.
|
137 |
-
|
138 |
-
Format the response as a JSON object with the following keys:
|
139 |
-
|
140 |
-
{{
|
141 |
-
"question": "The question text",
|
142 |
-
"options": {{
|
143 |
-
"A": "Option A text",
|
144 |
-
"B": "Option B text",
|
145 |
-
"C": "Option C text",
|
146 |
-
"D": "Option D text",
|
147 |
-
"E": "Option E text"
|
148 |
-
}},
|
149 |
-
"correct_answer": "The letter of the correct answer (A, B, C, D, or E)",
|
150 |
-
"explanation": "The explanation text",
|
151 |
-
"medical_reasoning": "The detailed medical reasoning text"
|
152 |
-
}}
|
153 |
-
"""
|
154 |
|
155 |
try:
|
156 |
chat_response = client.chat(
|
@@ -164,8 +50,14 @@ def generate_microbiology_question() -> Dict[str, str]:
|
|
164 |
response_content = chat_response.choices[0].message.content
|
165 |
logging.info(f"Received response from Mistral API: {response_content[:100]}...") # Log first 100 characters
|
166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
# Parse the JSON response
|
168 |
-
question_data = json.loads(
|
169 |
|
170 |
# Validate the structure of the parsed JSON
|
171 |
required_keys = ["question", "options", "correct_answer", "explanation", "medical_reasoning"]
|
|
|
4 |
import logging
|
5 |
import hashlib
|
6 |
import json
|
7 |
+
import re
|
8 |
from typing import List, Dict
|
9 |
from datetime import datetime
|
10 |
from mistralai.client import MistralClient
|
|
|
24 |
# Initialize Mistral client
|
25 |
client = MistralClient(api_key=api_key)
|
26 |
|
27 |
+
# ... (previous functions remain the same)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
+
def extract_json_from_markdown(markdown_text: str) -> str:
    """Extract JSON content from Markdown-formatted text.

    A fenced block tagged ``json`` (any letter case) is preferred. If none is
    present, an untagged fenced block is accepted as long as its payload
    starts with ``{`` or ``[``, i.e. looks like a JSON object or array.

    Args:
        markdown_text: Text that may contain a fenced code block.

    Returns:
        The text inside the first matching fence, with surrounding
        whitespace removed.

    Raises:
        ValueError: If no suitable fenced block is found.
    """
    # Tagged fence: ```json ... ``` (also ```JSON, ```Json, ...).
    json_match = re.search(
        r'```json\s*(.*?)\s*```', markdown_text, re.DOTALL | re.IGNORECASE
    )
    if json_match is None:
        # Untagged fence: only accepted when the payload looks like JSON, so
        # fences for other languages are not mistaken for JSON content.
        json_match = re.search(
            r'```\s*([\[{].*?)\s*```', markdown_text, re.DOTALL
        )
    if json_match is None:
        raise ValueError("No JSON content found in the Markdown text")
    return json_match.group(1)
|
36 |
|
37 |
def generate_microbiology_question() -> Dict[str, str]:
|
38 |
"""Generate a microbiology question."""
|
39 |
+
# ... (previous code remains the same)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
try:
|
42 |
chat_response = client.chat(
|
|
|
50 |
response_content = chat_response.choices[0].message.content
|
51 |
logging.info(f"Received response from Mistral API: {response_content[:100]}...") # Log first 100 characters
|
52 |
|
53 |
+
# Extract JSON from Markdown if necessary
|
54 |
+
try:
|
55 |
+
json_content = extract_json_from_markdown(response_content)
|
56 |
+
except ValueError:
|
57 |
+
json_content = response_content # If not in Markdown, use the original content
|
58 |
+
|
59 |
# Parse the JSON response
|
60 |
+
question_data = json.loads(json_content)
|
61 |
|
62 |
# Validate the structure of the parsed JSON
|
63 |
required_keys = ["question", "options", "correct_answer", "explanation", "medical_reasoning"]
|