ambrosfitz committed on
Commit
afaec0f
1 Parent(s): b5ae83c

Update question_generator.py

Browse files
Files changed (1) hide show
  1. question_generator.py +17 -125
question_generator.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  import logging
5
  import hashlib
6
  import json
 
7
  from typing import List, Dict
8
  from datetime import datetime
9
  from mistralai.client import MistralClient
@@ -23,134 +24,19 @@ model = "mistral-large-latest"
23
  # Initialize Mistral client
24
  client = MistralClient(api_key=api_key)
25
 
26
def load_csv_data(file_path: str) -> List[Dict[str, str]]:
    """Load every row of a CSV file as a list of dictionaries.

    The first row of the file is used as the header; each following row
    becomes one dict keyed by those header names (``csv.DictReader``).

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        A list with one dict per data row, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (logged, then
            re-raised).
        csv.Error: If the csv module cannot parse the file (logged, then
            re-raised).
    """
    logging.info(f"Loading data from {file_path}...")
    try:
        # newline='' hands line endings to the csv module untouched, so line
        # breaks embedded inside quoted fields are preserved as written.
        with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
            data = list(csv.DictReader(csvfile))
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except csv.Error as e:
        logging.error(f"Error reading CSV file {file_path}: {e}")
        raise
    logging.info(f"Loaded {len(data)} rows from {file_path}")
    return data
41
-
42
- # Load data from both CSV files
43
- try:
44
- detailed_cases = load_csv_data('processed_medical_history.csv')
45
- infectious_diseases = load_csv_data('infectious_diseases.csv')
46
- except Exception as e:
47
- logging.error(f"Failed to load CSV data: {e}")
48
- raise
49
-
50
def hash_question(question: str) -> str:
    """Return the MD5 hex digest of a question's text.

    The digest serves as a compact fingerprint so that two identical
    question strings map to the same value.
    """
    hasher = hashlib.md5()
    hasher.update(question.encode())
    return hasher.hexdigest()
53
-
54
def load_generated_questions() -> set:
    """Load previously recorded question hashes from ``generated_questions.txt``.

    Each non-blank line of the file is treated as one hash; surrounding
    whitespace is stripped. Blank lines are ignored so that an empty string
    never ends up in the returned set.

    Returns:
        The set of hashes found in the file, or an empty set when the file
        does not exist.
    """
    try:
        with open('generated_questions.txt', 'r') as f:
            stripped_lines = (line.strip() for line in f)
            return {entry for entry in stripped_lines if entry}
    except FileNotFoundError:
        # No history file yet: nothing has been recorded.
        return set()
61
 
62
def save_generated_question(question_hash: str):
    """Append one question hash, followed by a newline, to ``generated_questions.txt``."""
    with open('generated_questions.txt', 'a') as history_file:
        record = question_hash + '\n'
        history_file.write(record)
66
-
67
- generated_questions = load_generated_questions()
 
68
 
69
  def generate_microbiology_question() -> Dict[str, str]:
70
  """Generate a microbiology question."""
71
- question_types = [
72
- "clinical_vignette",
73
- "mechanism_of_pathogenesis",
74
- "laboratory_diagnosis",
75
- "antimicrobial_resistance",
76
- "vaccine_preventable_disease",
77
- "microbial_physiology_genetics",
78
- "epidemiology_transmission"
79
- ]
80
- question_type = random.choice(question_types)
81
- logging.info(f"Generating {question_type} question...")
82
-
83
- if question_type == "clinical_vignette":
84
- case = random.choice(detailed_cases)
85
- context = f"""
86
- Pathogen: {case['Pathogen_Name']} ({case['Pathogen_Type']})
87
- Key Symptoms: {case['Key_Symptoms']}
88
- Physical Findings: {case['Physical_Findings']}
89
- Lab Results: {case['Lab_Results']}
90
- Patient Demographics: {case['Patient_Demographics']}
91
- """
92
- else:
93
- disease = random.choice(infectious_diseases)
94
- context = f"""
95
- Infectious Agent: {disease['infectious_agent']}
96
- Diagnosis: {disease['diagnosis']}
97
- Treatment: {disease['treatment']}
98
- """
99
-
100
- prompt = f"""
101
- Create a microbiology question that could appear on the NBME exam. This should be a {question_type} question.
102
- Use the following information as inspiration, but feel free to expand or modify:
103
-
104
- {context}
105
-
106
- Generate a question based on the following template, depending on the question type:
107
-
108
- 1. Clinical Vignette with Pathogen Identification:
109
- A [age]-year-old [gender] presents with [symptoms and clinical findings]. [Additional relevant information]. Which of the following is the most likely causal organism?
110
-
111
- 2. Mechanism of Pathogenesis:
112
- [Description of a pathogen or clinical scenario]
113
- Which of the following best describes the mechanism by which this organism causes disease?
114
-
115
- 3. Laboratory Diagnosis:
116
- A patient presents with [symptoms]. [Description of laboratory findings or test results].
117
- Which of the following is the most likely diagnosis based on these laboratory findings?
118
-
119
- 4. Antimicrobial Mechanism and Resistance:
120
- A patient is diagnosed with [infection]. The causative organism is found to be resistant to [antibiotic]. Which of the following mechanisms is most likely responsible for this resistance?
121
-
122
- 5. Vaccine-Preventable Disease:
123
- A [age]-year-old [gender] presents with [symptoms of a vaccine-preventable disease]. Which of the following vaccines would have been most likely to prevent this condition?
124
-
125
- 6. Microbial Physiology and Genetics:
126
- An investigator observes [description of microbial behavior or genetic phenomenon]. Which of the following best explains this observation?
127
-
128
- 7. Epidemiology and Transmission:
129
- A cluster of [disease] cases is reported in [location]. [Description of affected population and circumstances]. Which of the following is the most likely mode of transmission?
130
-
131
- Include:
132
- 1. The question based on the selected template
133
- 2. Five possible answer options (A through E)
134
- 3. The correct answer
135
- 4. A brief explanation of why the correct answer is right and why the other options are incorrect
136
- 5. Detailed medical reasoning for the correct answer, including relevant pathophysiology, microbiology concepts, and clinical implications.
137
-
138
- Format the response as a JSON object with the following keys:
139
-
140
- {{
141
- "question": "The question text",
142
- "options": {{
143
- "A": "Option A text",
144
- "B": "Option B text",
145
- "C": "Option C text",
146
- "D": "Option D text",
147
- "E": "Option E text"
148
- }},
149
- "correct_answer": "The letter of the correct answer (A, B, C, D, or E)",
150
- "explanation": "The explanation text",
151
- "medical_reasoning": "The detailed medical reasoning text"
152
- }}
153
- """
154
 
155
  try:
156
  chat_response = client.chat(
@@ -164,8 +50,14 @@ def generate_microbiology_question() -> Dict[str, str]:
164
  response_content = chat_response.choices[0].message.content
165
  logging.info(f"Received response from Mistral API: {response_content[:100]}...") # Log first 100 characters
166
 
 
 
 
 
 
 
167
  # Parse the JSON response
168
- question_data = json.loads(response_content)
169
 
170
  # Validate the structure of the parsed JSON
171
  required_keys = ["question", "options", "correct_answer", "explanation", "medical_reasoning"]
 
4
  import logging
5
  import hashlib
6
  import json
7
+ import re
8
  from typing import List, Dict
9
  from datetime import datetime
10
  from mistralai.client import MistralClient
 
24
  # Initialize Mistral client
25
  client = MistralClient(api_key=api_key)
26
 
27
+ # ... (previous functions remain the same)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
def extract_json_from_markdown(markdown_text: str) -> str:
    """Extract the JSON payload from a Markdown fenced code block.

    A fence tagged ``json`` (in any letter case) is preferred. If none is
    present, an untagged fence whose content starts with ``{`` or ``[`` is
    accepted instead. Whitespace around the payload is trimmed.

    Args:
        markdown_text: Text that may contain a fenced code block.

    Returns:
        The text found inside the first matching fence.

    Raises:
        ValueError: If no matching fenced block is found.
    """
    fence_patterns = (
        # ```json ... ``` (also ```JSON, ```Json, ...)
        (r'```json\s*(.*?)\s*```', re.DOTALL | re.IGNORECASE),
        # Untagged fence; requiring a leading { or [ keeps fences tagged
        # with some other language from being mistaken for JSON.
        (r'```\s*([\[{].*?)\s*```', re.DOTALL),
    )
    for pattern, flags in fence_patterns:
        json_match = re.search(pattern, markdown_text, flags)
        if json_match:
            return json_match.group(1)
    raise ValueError("No JSON content found in the Markdown text")
36
 
37
  def generate_microbiology_question() -> Dict[str, str]:
38
  """Generate a microbiology question."""
39
+ # ... (previous code remains the same)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  try:
42
  chat_response = client.chat(
 
50
  response_content = chat_response.choices[0].message.content
51
  logging.info(f"Received response from Mistral API: {response_content[:100]}...") # Log first 100 characters
52
 
53
+ # Extract JSON from Markdown if necessary
54
+ try:
55
+ json_content = extract_json_from_markdown(response_content)
56
+ except ValueError:
57
+ json_content = response_content # If not in Markdown, use the original content
58
+
59
  # Parse the JSON response
60
+ question_data = json.loads(json_content)
61
 
62
  # Validate the structure of the parsed JSON
63
  required_keys = ["question", "options", "correct_answer", "explanation", "medical_reasoning"]