Spaces:
Sleeping
Sleeping
ambrosfitz
committed on
Commit
•
ececa0b
1
Parent(s):
afaec0f
Update question_generator.py
Browse files- question_generator.py +126 -2
question_generator.py
CHANGED
@@ -24,7 +24,49 @@ model = "mistral-large-latest"
|
|
24 |
# Initialize Mistral client
|
25 |
client = MistralClient(api_key=api_key)
|
26 |
|
27 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
def extract_json_from_markdown(markdown_text: str) -> str:
|
30 |
"""Extract JSON content from Markdown-formatted text."""
|
@@ -36,7 +78,89 @@ def extract_json_from_markdown(markdown_text: str) -> str:
|
|
36 |
|
37 |
def generate_microbiology_question() -> Dict[str, str]:
|
38 |
"""Generate a microbiology question."""
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
try:
|
42 |
chat_response = client.chat(
|
|
|
24 |
# Initialize Mistral client
|
25 |
client = MistralClient(api_key=api_key)
|
26 |
|
27 |
+
# Load data from CSV files
|
28 |
+
def load_csv_data(file_path: str) -> List[Dict[str, str]]:
    """Load every data row of a CSV file as a list of dictionaries.

    The file is parsed with ``csv.DictReader``, so the first row supplies the
    keys and each following row becomes one dict.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        One dict per data row, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist. The error is
            logged and then re-raised.
        csv.Error: If the ``csv`` module fails while parsing. The error is
            logged and then re-raised.
    """
    logging.info(f"Loading data from {file_path}...")
    try:
        # newline='' leaves line-ending handling to the csv module, as its
        # documentation requires; without it, "\r\n" embedded in a quoted
        # field is silently rewritten to "\n" by universal-newline decoding.
        with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
            data = list(csv.DictReader(csvfile))
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except csv.Error as e:
        logging.error(f"Error reading CSV file {file_path}: {e}")
        raise
    logging.info(f"Loaded {len(data)} rows from {file_path}")
    return data
|
43 |
+
|
44 |
+
# Read both source datasets once, at module level. generate_microbiology_question
# samples from detailed_cases for clinical vignettes and from
# infectious_diseases for every other question type.
try:
    detailed_cases = load_csv_data('processed_medical_history.csv')
    infectious_diseases = load_csv_data('infectious_diseases.csv')
except Exception as load_error:
    # Log a summary and let the original exception propagate out of the
    # module's top level.
    logging.error(f"Failed to load CSV data: {load_error}")
    raise
|
51 |
+
|
52 |
+
def hash_question(question: str) -> str:
    """Return the hexadecimal MD5 digest of a question's text.

    The digest serves as a compact fingerprint for duplicate detection, not
    as a security measure.

    Args:
        question: The question text to fingerprint.

    Returns:
        The 32-character lowercase hex digest of the encoded text.
    """
    fingerprint = hashlib.md5()
    fingerprint.update(question.encode())
    return fingerprint.hexdigest()
|
55 |
+
|
56 |
+
def load_generated_questions() -> set:
    """Read the stored question hashes from ``generated_questions.txt``.

    Each line of the file is stripped of surrounding whitespace and added to
    the result.

    Returns:
        The set of stripped lines, or an empty set when the file does not
        exist.
    """
    seen_hashes = set()
    try:
        with open('generated_questions.txt', 'r') as history:
            for raw_line in history:
                seen_hashes.add(raw_line.strip())
    except FileNotFoundError:
        # No history file yet: nothing has been recorded.
        return set()
    return seen_hashes
|
63 |
+
|
64 |
+
def save_generated_question(question_hash: str):
    """Append one question hash to ``generated_questions.txt``.

    The file is opened in append mode, so it is created if missing and
    existing entries are kept.

    Args:
        question_hash: The hash string to record; written on its own line.
    """
    entry = question_hash + '\n'
    with open('generated_questions.txt', 'a') as history:
        history.write(entry)
|
68 |
+
|
69 |
+
# Hashes of previously generated questions, read once at module level from
# generated_questions.txt (an empty set when that file does not exist yet).
generated_questions = load_generated_questions()
|
70 |
|
71 |
def extract_json_from_markdown(markdown_text: str) -> str:
|
72 |
"""Extract JSON content from Markdown-formatted text."""
|
|
|
78 |
|
79 |
def generate_microbiology_question() -> Dict[str, str]:
|
80 |
"""Generate a microbiology question."""
|
81 |
+
question_types = [
|
82 |
+
"clinical_vignette",
|
83 |
+
"mechanism_of_pathogenesis",
|
84 |
+
"laboratory_diagnosis",
|
85 |
+
"antimicrobial_resistance",
|
86 |
+
"vaccine_preventable_disease",
|
87 |
+
"microbial_physiology_genetics",
|
88 |
+
"epidemiology_transmission"
|
89 |
+
]
|
90 |
+
question_type = random.choice(question_types)
|
91 |
+
logging.info(f"Generating {question_type} question...")
|
92 |
+
|
93 |
+
if question_type == "clinical_vignette":
|
94 |
+
case = random.choice(detailed_cases)
|
95 |
+
context = f"""
|
96 |
+
Pathogen: {case['Pathogen_Name']} ({case['Pathogen_Type']})
|
97 |
+
Key Symptoms: {case['Key_Symptoms']}
|
98 |
+
Physical Findings: {case['Physical_Findings']}
|
99 |
+
Lab Results: {case['Lab_Results']}
|
100 |
+
Patient Demographics: {case['Patient_Demographics']}
|
101 |
+
"""
|
102 |
+
else:
|
103 |
+
disease = random.choice(infectious_diseases)
|
104 |
+
context = f"""
|
105 |
+
Infectious Agent: {disease['infectious_agent']}
|
106 |
+
Diagnosis: {disease['diagnosis']}
|
107 |
+
Treatment: {disease['treatment']}
|
108 |
+
"""
|
109 |
+
|
110 |
+
prompt = f"""
|
111 |
+
Create a microbiology question that could appear on the NBME exam. This should be a {question_type} question.
|
112 |
+
Use the following information as inspiration, but feel free to expand or modify:
|
113 |
+
|
114 |
+
{context}
|
115 |
+
|
116 |
+
Generate a question based on the following template, depending on the question type:
|
117 |
+
|
118 |
+
1. Clinical Vignette with Pathogen Identification:
|
119 |
+
A [age]-year-old [gender] presents with [symptoms and clinical findings]. [Additional relevant information]. Which of the following is the most likely causal organism?
|
120 |
+
|
121 |
+
2. Mechanism of Pathogenesis:
|
122 |
+
[Description of a pathogen or clinical scenario]
|
123 |
+
Which of the following best describes the mechanism by which this organism causes disease?
|
124 |
+
|
125 |
+
3. Laboratory Diagnosis:
|
126 |
+
A patient presents with [symptoms]. [Description of laboratory findings or test results].
|
127 |
+
Which of the following is the most likely diagnosis based on these laboratory findings?
|
128 |
+
|
129 |
+
4. Antimicrobial Mechanism and Resistance:
|
130 |
+
A patient is diagnosed with [infection]. The causative organism is found to be resistant to [antibiotic]. Which of the following mechanisms is most likely responsible for this resistance?
|
131 |
+
|
132 |
+
5. Vaccine-Preventable Disease:
|
133 |
+
A [age]-year-old [gender] presents with [symptoms of a vaccine-preventable disease]. Which of the following vaccines would have been most likely to prevent this condition?
|
134 |
+
|
135 |
+
6. Microbial Physiology and Genetics:
|
136 |
+
An investigator observes [description of microbial behavior or genetic phenomenon]. Which of the following best explains this observation?
|
137 |
+
|
138 |
+
7. Epidemiology and Transmission:
|
139 |
+
A cluster of [disease] cases is reported in [location]. [Description of affected population and circumstances]. Which of the following is the most likely mode of transmission?
|
140 |
+
|
141 |
+
Include:
|
142 |
+
1. The question based on the selected template
|
143 |
+
2. Five possible answer options (A through E)
|
144 |
+
3. The correct answer
|
145 |
+
4. A brief explanation of why the correct answer is right and why the other options are incorrect
|
146 |
+
5. Detailed medical reasoning for the correct answer, including relevant pathophysiology, microbiology concepts, and clinical implications.
|
147 |
+
|
148 |
+
Format the response as a JSON object with the following keys:
|
149 |
+
|
150 |
+
{{
|
151 |
+
"question": "The question text",
|
152 |
+
"options": {{
|
153 |
+
"A": "Option A text",
|
154 |
+
"B": "Option B text",
|
155 |
+
"C": "Option C text",
|
156 |
+
"D": "Option D text",
|
157 |
+
"E": "Option E text"
|
158 |
+
}},
|
159 |
+
"correct_answer": "The letter of the correct answer (A, B, C, D, or E)",
|
160 |
+
"explanation": "The explanation text",
|
161 |
+
"medical_reasoning": "The detailed medical reasoning text"
|
162 |
+
}}
|
163 |
+
"""
|
164 |
|
165 |
try:
|
166 |
chat_response = client.chat(
|