# ClinicianAssistant / scripts/process_idsr.py
# Author: JDFPalladium
# Commit 389c5f0: cleaning up organization of scripts and data and updating
# filepaths in app to processed data
import os
import re
import json
from dotenv import load_dotenv
from openai import OpenAI
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from rapidfuzz import fuzz
# === Setup ===
# All paths are resolved relative to this script's own directory.
# NOTE(review): assumes the data/ tree lives alongside this script
# (…/data/raw and …/data/processed) — confirm against the repo layout,
# since the commit message mentions filepath reorganization.
base_dir = os.path.dirname(__file__)
raw_path = os.path.abspath(os.path.join(base_dir, "data", "raw"))
processed_path = os.path.abspath(os.path.join(base_dir, "data", "processed"))
os.makedirs(processed_path, exist_ok=True)
# Load OPENAI_API_KEY (and any other settings) from config.env into the process env.
load_dotenv(os.path.join(base_dir, "config.env"))
api_key = os.environ.get("OPENAI_API_KEY")
# === Step 1: Read IDSR Text ===
# Raw IDSR disease descriptions; parsed into per-disease sections in Step 3.
with open(os.path.join(raw_path, "IDSR.txt"), encoding="utf-8") as f:
    text = f.read()
# === Step 2: Extract Keywords via GPT ===
# FIX: the prompt contained mojibake ("β€”" for the em dash "—"), which was
# being sent verbatim to the model.
prompt = """
You are a helpful assistant. Extract a list of 30–50 key symptoms, signs, or diagnostic terms from the following disease descriptions.
Focus on words or phrases that are likely to appear in clinical case definitions or user queries — such as "fever", "skin lesions", "swollen lymph nodes", "positive blood smear", etc.
Only return the keywords or short phrases — one per line.
Text:
"""

# Pass the key loaded from config.env explicitly rather than relying on the
# OPENAI_API_KEY environment variable having been exported.
client = OpenAI(api_key=api_key)
# temperature=0 keeps the extracted keyword list deterministic across runs.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt + text},
    ],
    temperature=0.0,
)

# One keyword per response line; drop blank lines and surrounding whitespace.
keywords = [line.strip() for line in response.choices[0].message.content.splitlines() if line.strip()]
def normalize_kw(kw):
    """Normalize one GPT-returned keyword line for matching.

    Strips leading list markers ("-", "•", "*") and surrounding whitespace,
    then lower-cases the result so downstream fuzzy matching is
    case-insensitive.

    FIX: the original stripped the mojibake bytes "β€’" instead of the actual
    bullet character "•" that models emit; "*" bullets are now handled too.
    """
    return kw.lstrip("-•* ").strip().lower()
# Normalize every keyword so matching in Step 5 is marker- and case-insensitive.
keywords = [normalize_kw(kw) for kw in keywords]

# Save keywords, one per line, for reuse by the app.
# FIX: the status print contained mojibake ("βœ…" for "✅").
kw_path = os.path.join(processed_path, "idsr_keywords.txt")
with open(kw_path, "w", encoding="utf-8") as f:
    f.writelines(f"{keyword}\n" for keyword in keywords)
print(f"✅ Saved keywords to {kw_path}")
# === Step 3: Parse Disease Sections ===
def parse_disease_text(text):
    """Parse the raw IDSR text into a list of per-disease dicts.

    Blank lines separate diseases. Within a disease, the first line is the
    name (stored under "disease_name") and subsections are introduced by
    header lines of the form "- Title:". Body lines are joined with single
    spaces under their subsection title. Body lines appearing before the
    first subsection header are discarded.
    """
    header_re = re.compile(r"^-\s*(.+):\s*$")
    parsed = []
    record = None    # dict for the disease currently being read
    section = None   # title of the subsection currently being filled
    pending = []     # raw body lines collected for that subsection

    def flush_section():
        # Commit buffered body lines to the current subsection, if any.
        if record is not None and section and pending:
            record[section] = " ".join(ln.strip() for ln in pending).strip()

    # The trailing sentinel "" guarantees the final disease is flushed.
    for raw in text.strip().splitlines() + [""]:
        if not raw.strip():
            flush_section()
            if record:
                parsed.append(record)
            record, section, pending = None, None, []
            continue
        if record is None:
            record = {"disease_name": raw.strip()}
            continue
        header = header_re.match(raw)
        if header:
            flush_section()
            section = header.group(1).strip()
            pending = []
        else:
            pending.append(raw.rstrip())
    return parsed
# Split the raw IDSR text into one dict per disease.
disease_dicts = parse_disease_text(text)
# === Step 4: Convert to LangChain Documents ===
def convert_disease_dicts_to_documents(disease_dicts):
    """Render each parsed disease dict as a LangChain Document.

    The page content begins with "Disease: <name>" followed by a blank line,
    then each subsection as "<title>:" and its text, separated by blank
    lines. The disease name is also stored in the Document metadata.
    """
    documents = []
    for entry in disease_dicts:
        name = entry.get("disease_name", "")
        sections = []
        for heading, body in entry.items():
            if heading != "disease_name":
                sections.append(f"{heading}:\n{body}")
        page = f"Disease: {name}\n\n" + "\n\n".join(sections)
        documents.append(Document(page_content=page, metadata={"disease_name": name}))
    return documents
# Flatten each disease dict into a single embedding-ready Document.
documents = convert_disease_dicts_to_documents(disease_dicts)
# === Step 5: Tag Documents with Keywords ===
def tag_documents_with_keywords(documents, keywords, threshold=85):
    """Attach to each document the keywords that fuzzily match its text.

    A keyword counts as matched when rapidfuzz's partial_ratio between the
    lower-cased keyword and the lower-cased page content meets `threshold`.
    Matches are written to doc.metadata["matched_keywords"]; the documents
    are mutated in place and returned in a new list.
    """
    result = []
    for doc in documents:
        haystack = doc.page_content.lower()
        hits = []
        for kw in keywords:
            if fuzz.partial_ratio(kw.lower(), haystack) >= threshold:
                hits.append(kw)
        doc.metadata["matched_keywords"] = hits
        result.append(doc)
    return result
# Tag with the default fuzzy-match threshold of 85.
tagged_documents = tag_documents_with_keywords(documents, keywords)
# Save JSON version of the tagged documents for inspection / app consumption.
# FIX: the status print contained mojibake ("βœ…" for "✅").
json_path = os.path.join(processed_path, "tagged_documents.json")
with open(json_path, "w", encoding="utf-8") as f:
    # NOTE(review): Document.dict() is the pydantic-v1-style API; newer
    # langchain-core versions prefer doc.model_dump() — confirm the pinned version.
    json.dump([doc.dict() for doc in tagged_documents], f, ensure_ascii=False, indent=2)
print(f"✅ Saved tagged documents to {json_path}")
# === Step 6: Build and Save FAISS Vectorstore ===
# Embed every tagged document and persist the index next to the other
# processed artifacts so the app can load it without re-embedding.
# FIX: the status print contained mojibake ("βœ…" for "✅").
embedding_model = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(tagged_documents, embedding_model)
vs_path = os.path.join(processed_path, "disease_vectorstore")
vectorstore.save_local(vs_path)
print(f"✅ Saved FAISS vectorstore to {vs_path}")