Upload 24 files

- .gitattributes +4 -0
- README.md +49 -0
- app.py +54 -0
- config.py +11 -0
- data/Example-Drug-Monograph.pdf +3 -0
- data/Pharmaceutical-Manufacturing-Handbook-Production-and-Processes-Wiley-2008.pdf +3 -0
- data/drugmonographs.pdf +3 -0
- data/pe-009-17-gmp-guide-xannexes.pdf +3 -0
- requirements.txt +11 -0
- src/_.txt +0 -0
- src/__init__.py +0 -0
- src/__pycache__/chunking.cpython-312.pyc +0 -0
- src/__pycache__/loader.cpython-312.pyc +0 -0
- src/__pycache__/memory.cpython-312.pyc +0 -0
- src/__pycache__/prompts.cpython-312.pyc +0 -0
- src/__pycache__/rag_pipelines.cpython-312.pyc +0 -0
- src/__pycache__/ticketing.cpython-312.pyc +0 -0
- src/__pycache__/vector_store.cpython-312.pyc +0 -0
- src/chunking.py +12 -0
- src/loader.py +18 -0
- src/memory.py +10 -0
- src/prompts.py +11 -0
- src/rag_pipelines.py +89 -0
- src/ticketing.py +22 -0
- src/vector_store.py +12 -0
.gitattributes
ADDED
@@ -0,0 +1,4 @@
+data/drugmonographs.pdf filter=lfs diff=lfs merge=lfs -text
+data/Example-Drug-Monograph.pdf filter=lfs diff=lfs merge=lfs -text
+data/pe-009-17-gmp-guide-xannexes.pdf filter=lfs diff=lfs merge=lfs -text
+data/Pharmaceutical-Manufacturing-Handbook-Production-and-Processes-Wiley-2008.pdf filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,49 @@
+---
+title: AI Pharma Support Assistant
+emoji: "💊"
+colorFrom: blue
+colorTo: indigo
+sdk: streamlit
+app_file: app.py
+pinned: false
+---
+
+# AI Pharma Support Assistant (Hugging Face Spaces)
+
+This project is ready to run as a **Streamlit Space**.
+
+## Features
+
+- Loads PDF pharma documents from `data/`
+- Chunks content and builds a FAISS vector store
+- Answers questions with RAG + citations
+- Optional GitHub ticket creation via tool-calling
+
+## Required Secrets in Hugging Face Space
+
+In your Space, go to **Settings -> Variables and secrets** and add:
+
+- `OPENAI_API_KEY` (required)
+- `GITHUB_TOKEN` (optional, for ticketing)
+- `GITHUB_REPO` (optional, format: `owner/repo`)
+
+## Local Run
+
+```powershell
+python -m venv venv
+venv\Scripts\activate
+pip install -r requirements.txt
+streamlit run app.py
+```
+
+## Deploy to Hugging Face Spaces
+
+1. Create a new Space and choose the **Streamlit** SDK.
+2. Push this repository to the Space Git repo.
+3. Add the required secrets (`OPENAI_API_KEY` at minimum).
+4. The Space will build automatically and run `app.py`.
+
+## Notes
+
+- Keep `.env` local only. Do not commit secrets.
+- PDF files should remain under `data/`.
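Taken together, the README's feature list maps onto the modules added below. A condensed sketch of the end-to-end flow, using only functions from this commit's `src/` package (the question string is illustrative):

```python
# End-to-end sketch of the pipeline described above, mirroring what
# app.py below wires up. Run from the repo root with data/*.pdf present.
from src.loader import load_all_pdfs
from src.chunking import chunk_documents
from src.vector_store import create_vector_store
from src.rag_pipelines import build_context, generate_answer, retrieve_documents
from src.memory import add_to_memory, get_memory

docs = load_all_pdfs("data")      # one Document per PDF page
chunks = chunk_documents(docs)    # 1000-char chunks with 200-char overlap
db = create_vector_store(chunks)  # FAISS index over OpenAI embeddings

question = "What does the GMP guide say about sterile filtration?"  # example only
hits = retrieve_documents(question, db, k=3)
context, sources = build_context(hits)
answer = generate_answer(question, context, sources, get_memory())
add_to_memory(question, answer)
print(answer)
```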
app.py
ADDED
@@ -0,0 +1,54 @@
+import streamlit as st
+
+from src.chunking import chunk_documents
+from src.loader import load_all_pdfs
+from src.memory import add_to_memory, get_memory
+from src.rag_pipelines import build_context, generate_answer, retrieve_documents
+from src.vector_store import create_vector_store
+
+st.set_page_config(page_title="AI Pharma Support", layout="wide")
+st.title("AI Pharmaceutical Support Assistant")
+
+
+@st.cache_resource
+def load_system():
+    docs = load_all_pdfs()
+    if not docs:
+        raise ValueError("No PDF documents found in the data/ directory.")
+
+    chunks = chunk_documents(docs)
+    if not chunks:
+        raise ValueError("PDFs were loaded, but no text chunks were created.")
+
+    return create_vector_store(chunks)
+
+
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+
+try:
+    db = load_system()
+except Exception as exc:
+    st.error(
+        "App initialization failed. Check OPENAI_API_KEY, dependencies, and data/*.pdf files."
+    )
+    st.exception(exc)
+    st.stop()
+
+user_input = st.text_input("Ask your pharma support question:")
+
+if user_input:
+    docs = retrieve_documents(user_input, db)
+    context, sources = build_context(docs)
+    memory = get_memory()
+    answer = generate_answer(user_input, context, sources, memory)
+
+    add_to_memory(user_input, answer)
+    st.session_state.chat_history.append(("You", user_input))
+    st.session_state.chat_history.append(("AI", answer))
+
+for role, message in st.session_state.chat_history:
+    if role == "You":
+        st.markdown(f"**You:** {message}")
+    else:
+        st.markdown(f"**AI:** {message}")
config.py
ADDED
@@ -0,0 +1,11 @@
+import os
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+# Load local .env for development; Hugging Face Spaces injects variables directly.
+load_dotenv(dotenv_path=Path(__file__).parent / ".env")
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
+GITHUB_REPO = os.getenv("GITHUB_REPO", "")
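Note that `src/rag_pipelines.py` later in this commit calls `OpenAI()` with no arguments, so the client reads `OPENAI_API_KEY` from the environment (which `load_dotenv` populates locally) rather than from this module's constant. A minimal sketch of wiring the key through `config.py` explicitly, if that is preferred:

```python
# Sketch: pass the key from config.py instead of relying on the client
# picking up OPENAI_API_KEY from the environment.
from openai import OpenAI

from config import OPENAI_API_KEY

client = OpenAI(api_key=OPENAI_API_KEY)  # an explicit api_key is supported
```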
data/Example-Drug-Monograph.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:14839f729c948040dbd9e205b0699374fced68ebdb5da7ace466bfc000cfa26e
+size 1166829
data/Pharmaceutical-Manufacturing-Handbook-Production-and-Processes-Wiley-2008.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6077a5c9ecafab832008c8a8937a6620102ca18df56c06a74382fff19bccd5f
+size 13092552
data/drugmonographs.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b49755d3ab71a2cf0e1e99121ff9122fcf7717bfa9d70f8b2318484187318d8
+size 5043791
data/pe-009-17-gmp-guide-xannexes.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06d9ed3eed8197a10738457ad2f6aecaa9357c3c9b0c19b3f2d323b2e1c64ef3
+size 2368402
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+langchain
+langchain-community
+langchain-openai
+langchain-text-splitters
+openai
+faiss-cpu
+pypdf
+tiktoken
+streamlit
+python-dotenv
+requests
src/_.txt
ADDED
File without changes
src/__init__.py
ADDED
File without changes
src/__pycache__/chunking.cpython-312.pyc
ADDED
Binary file (576 Bytes)

src/__pycache__/loader.cpython-312.pyc
ADDED
Binary file (788 Bytes)

src/__pycache__/memory.cpython-312.pyc
ADDED
Binary file (520 Bytes)

src/__pycache__/prompts.cpython-312.pyc
ADDED
Binary file (553 Bytes)

src/__pycache__/rag_pipelines.cpython-312.pyc
ADDED
Binary file (3.08 kB)

src/__pycache__/ticketing.cpython-312.pyc
ADDED
Binary file (1.02 kB)

src/__pycache__/vector_store.cpython-312.pyc
ADDED
Binary file (570 Bytes)
src/chunking.py
ADDED
@@ -0,0 +1,12 @@
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+def chunk_documents(documents):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        separators=["\n\n", "\n", ".", " ", ""]
+    )
+
+    chunks = text_splitter.split_documents(documents)
+
+    return chunks
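A quick way to exercise the splitter in isolation; the `Document` content here is made-up sample text:

```python
# Illustrative smoke test for chunk_documents (the sample text is invented).
from langchain_core.documents import Document

from src.chunking import chunk_documents

doc = Document(
    page_content="Sterile filtration is required before filling. " * 150,
    metadata={"source": "data/example.pdf", "page": 0},
)
chunks = chunk_documents([doc])
# Expect several chunks of roughly chunk_size characters, overlapping by ~200.
print(len(chunks), max(len(c.page_content) for c in chunks))
```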
src/loader.py
ADDED
@@ -0,0 +1,18 @@
+import os
+
+from langchain_community.document_loaders import PyPDFLoader
+
+
+def load_all_pdfs(folder_path="data"):
+    """Load all PDF files from a folder into LangChain documents."""
+    documents = []
+
+    if not os.path.isdir(folder_path):
+        return documents
+
+    for file_name in sorted(os.listdir(folder_path)):
+        if file_name.lower().endswith(".pdf"):
+            loader = PyPDFLoader(os.path.join(folder_path, file_name))
+            documents.extend(loader.load())
+
+    return documents
src/memory.py
ADDED
@@ -0,0 +1,10 @@
+chat_history = []
+
+def add_to_memory(user_input, bot_response):
+    chat_history.append({
+        "user": user_input,
+        "bot": bot_response
+    })
+
+def get_memory():
+    return chat_history
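One caveat with this module: `chat_history` is process-global, so every visitor to a deployed Space shares the same memory. A per-session variant built on `st.session_state` might look like this (a sketch, not part of this commit):

```python
# Hypothetical per-session memory using Streamlit's session state,
# so concurrent users do not share one global chat_history list.
import streamlit as st

def _history():
    if "rag_memory" not in st.session_state:
        st.session_state.rag_memory = []
    return st.session_state.rag_memory

def add_to_memory(user_input, bot_response):
    _history().append({"user": user_input, "bot": bot_response})

def get_memory():
    return _history()
```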
src/prompts.py
ADDED
@@ -0,0 +1,11 @@
+SYSTEM_PROMPT = """
+You are a pharmaceutical employee support AI.
+
+Rules:
+- Answer only using the provided context.
+- Always include citations (file name + page) when available.
+- If information is missing, say: "I couldn't find this information."
+- Be clear and professional.
+- If the user's problem is not solved or remains unclear, suggest creating a support ticket.
+- If the user agrees, call the function to create a ticket.
+"""
src/rag_pipelines.py
ADDED
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+import json
+from typing import List, Tuple
+
+from openai import OpenAI
+
+from src.prompts import SYSTEM_PROMPT
+from src.ticketing import create_github_issue
+
+
+def retrieve_documents(query, db, k=3):
+    return db.similarity_search(query, k=k)
+
+
+def build_context(docs) -> Tuple[str, List[str]]:
+    context_parts: List[str] = []
+    sources: List[str] = []
+
+    for doc in docs:
+        context_parts.append(doc.page_content)
+        source = doc.metadata.get("source", "unknown")
+        page = doc.metadata.get("page", "?")
+        source_info = f"{source} - page {page}"
+        if source_info not in sources:
+            sources.append(source_info)
+
+    return "\n\n".join(context_parts), sources
+
+
+def format_chat_history(memory):
+    messages = []
+    for item in memory:
+        messages.append({"role": "user", "content": item["user"]})
+        messages.append({"role": "assistant", "content": item["bot"]})
+    return messages
+
+
+def _ticket_tool():
+    return [
+        {
+            "type": "function",
+            "function": {
+                "name": "create_support_ticket",
+                "description": "Create a support ticket when the user has an unresolved issue.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "title": {"type": "string"},
+                        "description": {"type": "string"},
+                    },
+                    "required": ["title", "description"],
+                },
+            },
+        }
+    ]
+
+
+def generate_answer(query, context, sources, memory):
+    client = OpenAI()
+    history_messages = format_chat_history(memory)
+
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {"role": "system", "content": SYSTEM_PROMPT},
+            *history_messages,
+            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
+        ],
+        tools=_ticket_tool(),
+        tool_choice="auto",
+    )
+
+    message = response.choices[0].message
+
+    if message.tool_calls:
+        tool_call = message.tool_calls[0]
+        if tool_call.function.name == "create_support_ticket":
+            args = json.loads(tool_call.function.arguments)
+            issue_url = create_github_issue(
+                title=args["title"],
+                description=args["description"],
+            )
+            return f"Support ticket result: {issue_url}"
+
+    answer = message.content or "I could not generate an answer."
+    if not sources:
+        return answer
+    return answer + "\n\nSources:\n" + "\n".join(sources)
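`generate_answer` short-circuits the tool call: it returns the ticket URL itself rather than sending the tool result back to the model. The standard OpenAI tool-calling round trip, sketched here as a hypothetical replacement for that early return (all names are the locals already in scope at that point in the function):

```python
# Hypothetical fragment: let the model phrase the ticket confirmation
# by appending the assistant tool-call message and a "tool" result message.
follow_up = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
        message,  # the assistant message that carried the tool call
        {"role": "tool", "tool_call_id": tool_call.id, "content": issue_url},
    ],
)
return follow_up.choices[0].message.content
```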
src/ticketing.py
ADDED
@@ -0,0 +1,22 @@
+import requests
+
+from config import GITHUB_REPO, GITHUB_TOKEN
+
+
+def create_github_issue(title: str, description: str) -> str:
+    """Create a GitHub issue if credentials are configured."""
+    if not GITHUB_TOKEN or not GITHUB_REPO:
+        return "Ticketing is not configured. Set GITHUB_TOKEN and GITHUB_REPO."
+
+    url = f"https://api.github.com/repos/{GITHUB_REPO}/issues"
+    headers = {
+        "Authorization": f"token {GITHUB_TOKEN}",
+        "Accept": "application/vnd.github+json",
+    }
+    data = {"title": title, "body": description}
+
+    response = requests.post(url, json=data, headers=headers, timeout=20)
+    if response.status_code == 201:
+        return response.json().get("html_url", "Ticket created.")
+
+    return f"Failed to create ticket ({response.status_code}): {response.text}"
src/vector_store.py
ADDED
@@ -0,0 +1,12 @@
+from langchain_community.vectorstores import FAISS
+from langchain_openai import OpenAIEmbeddings
+
+def create_vector_store(chunks):
+    embeddings = OpenAIEmbeddings()
+
+    db = FAISS.from_documents(
+        documents=chunks,
+        embedding=embeddings
+    )
+
+    return db
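Because `create_vector_store` embeds every chunk from scratch, each cold start of the Space pays the embedding cost again. The FAISS wrapper can persist its index; a sketch using `save_local`/`load_local` from langchain_community (the `faiss_index` directory name is made up, and the deserialization flag should be checked against your installed version):

```python
# Hypothetical caching wrapper around create_vector_store.
import os

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

from src.vector_store import create_vector_store

INDEX_DIR = "faiss_index"

def load_or_build_vector_store(chunks):
    embeddings = OpenAIEmbeddings()
    if os.path.isdir(INDEX_DIR):
        # Only load indexes you wrote yourself: load_local unpickles
        # stored metadata, hence the explicit opt-in flag.
        return FAISS.load_local(
            INDEX_DIR, embeddings, allow_dangerous_deserialization=True
        )
    db = create_vector_store(chunks)
    db.save_local(INDEX_DIR)
    return db
```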