Rahbarnisa committed · verified
Commit 9a2d4ec · 1 Parent(s): 212087a

Upload 24 files

.gitattributes ADDED
@@ -0,0 +1,4 @@
+ data/drugmonographs.pdf filter=lfs diff=lfs merge=lfs -text
+ data/Example-Drug-Monograph.pdf filter=lfs diff=lfs merge=lfs -text
+ data/pe-009-17-gmp-guide-xannexes.pdf filter=lfs diff=lfs merge=lfs -text
+ data/Pharmaceutical-Manufacturing-Handbook-Production-and-Processes-Wiley-2008.pdf filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,49 @@
+ ---
+ title: AI Pharma Support Assistant
+ emoji: "💊"
+ colorFrom: blue
+ colorTo: indigo
+ sdk: streamlit
+ app_file: app.py
+ pinned: false
+ ---
+
+ # AI Pharma Support Assistant (Hugging Face Spaces)
+
+ This project is ready to run as a **Streamlit Space**.
+
+ ## Features
+
+ - Loads PDF pharma documents from `data/`
+ - Chunks content and builds a FAISS vector store
+ - Answers questions with RAG + citations
+ - Optional GitHub ticket creation via tool-calling
+
+ ## Required Secrets in Hugging Face Space
+
+ In your Space, go to **Settings -> Variables and secrets** and add:
+
+ - `OPENAI_API_KEY` (required)
+ - `GITHUB_TOKEN` (optional, for ticketing)
+ - `GITHUB_REPO` (optional, format: `owner/repo`)
+
+ ## Local Run
+
+ ```powershell
+ python -m venv venv
+ venv\Scripts\activate
+ pip install -r requirements.txt
+ streamlit run app.py
+ ```
+
+ ## Deploy to Hugging Face Spaces
+
+ 1. Create a new Space and choose the **Streamlit** SDK.
+ 2. Push this repository to the Space Git repo.
+ 3. Add the required secrets (`OPENAI_API_KEY` at minimum).
+ 4. The Space will auto-build and run `app.py`.
+
+ ## Notes
+
+ - Keep `.env` local only. Do not commit secrets.
+ - PDF files should remain under `data/`.
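The feature list above maps directly onto the modules under `src/`. For orientation, here is a minimal sketch of the same pipeline run outside Streamlit; it is illustrative only and assumes `OPENAI_API_KEY` is set, at least one PDF sits in `data/`, and the question string is made up:

```python
# Illustrative sketch of the RAG flow that app.py wires up, using the helpers from src/.
from src.loader import load_all_pdfs
from src.chunking import chunk_documents
from src.vector_store import create_vector_store
from src.rag_pipelines import retrieve_documents, build_context, generate_answer

docs = load_all_pdfs("data")        # one Document per PDF page
chunks = chunk_documents(docs)      # ~1000-character chunks with 200-character overlap
db = create_vector_store(chunks)    # FAISS index over OpenAI embeddings

question = "What storage conditions does the example monograph specify?"
retrieved = retrieve_documents(question, db, k=3)
context, sources = build_context(retrieved)
print(generate_answer(question, context, sources, memory=[]))
```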
app.py ADDED
@@ -0,0 +1,54 @@
+ import streamlit as st
+
+ from src.chunking import chunk_documents
+ from src.loader import load_all_pdfs
+ from src.memory import add_to_memory, get_memory
+ from src.rag_pipelines import build_context, generate_answer, retrieve_documents
+ from src.vector_store import create_vector_store
+
+ st.set_page_config(page_title="AI Pharma Support", layout="wide")
+ st.title("AI Pharmaceutical Support Assistant")
+
+
+ @st.cache_resource
+ def load_system():
+     docs = load_all_pdfs()
+     if not docs:
+         raise ValueError("No PDF documents found in the data/ directory.")
+
+     chunks = chunk_documents(docs)
+     if not chunks:
+         raise ValueError("PDFs were loaded, but no text chunks were created.")
+
+     return create_vector_store(chunks)
+
+
+ if "chat_history" not in st.session_state:
+     st.session_state.chat_history = []
+
+ try:
+     db = load_system()
+ except Exception as exc:
+     st.error(
+         "App initialization failed. Check OPENAI_API_KEY, dependencies, and data/*.pdf files."
+     )
+     st.exception(exc)
+     st.stop()
+
+ user_input = st.text_input("Ask your pharma support question:")
+
+ if user_input:
+     docs = retrieve_documents(user_input, db)
+     context, sources = build_context(docs)
+     memory = get_memory()
+     answer = generate_answer(user_input, context, sources, memory)
+
+     add_to_memory(user_input, answer)
+     st.session_state.chat_history.append(("You", user_input))
+     st.session_state.chat_history.append(("AI", answer))
+
+ for role, message in st.session_state.chat_history:
+     if role == "You":
+         st.markdown(f"**You:** {message}")
+     else:
+         st.markdown(f"**AI:** {message}")
config.py ADDED
@@ -0,0 +1,11 @@
+ import os
+ from pathlib import Path
+
+ from dotenv import load_dotenv
+
+ # Load local .env for development; Hugging Face Spaces injects variables directly.
+ load_dotenv(dotenv_path=Path(__file__).parent / ".env")
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+ GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
+ GITHUB_REPO = os.getenv("GITHUB_REPO", "")
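A quick, illustrative way to confirm these values are actually picked up in a local shell (not part of this commit; importing `config` triggers the `load_dotenv` call above):

```python
# Illustrative sanity check, not in the repository: verify which secrets are visible.
from config import GITHUB_REPO, GITHUB_TOKEN, OPENAI_API_KEY

print("OpenAI key set:", bool(OPENAI_API_KEY))
print("Ticketing enabled:", bool(GITHUB_TOKEN and GITHUB_REPO))
```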
data/Example-Drug-Monograph.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14839f729c948040dbd9e205b0699374fced68ebdb5da7ace466bfc000cfa26e
+ size 1166829
data/Pharmaceutical-Manufacturing-Handbook-Production-and-Processes-Wiley-2008.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6077a5c9ecafab832008c8a8937a6620102ca18df56c06a74382fff19bccd5f
+ size 13092552
data/drugmonographs.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b49755d3ab71a2cf0e1e99121ff9122fcf7717bfa9d70f8b2318484187318d8
+ size 5043791
data/pe-009-17-gmp-guide-xannexes.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06d9ed3eed8197a10738457ad2f6aecaa9357c3c9b0c19b3f2d323b2e1c64ef3
+ size 2368402
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ langchain
+ langchain-community
+ langchain-openai
+ langchain-text-splitters
+ openai
+ faiss-cpu
+ pypdf
+ tiktoken
+ streamlit
+ python-dotenv
+ requests
src/_.txt ADDED
File without changes
src/__init__.py ADDED
File without changes
src/__pycache__/chunking.cpython-312.pyc ADDED
Binary file (576 Bytes)
src/__pycache__/loader.cpython-312.pyc ADDED
Binary file (788 Bytes)
src/__pycache__/memory.cpython-312.pyc ADDED
Binary file (520 Bytes)
src/__pycache__/prompts.cpython-312.pyc ADDED
Binary file (553 Bytes)
src/__pycache__/rag_pipelines.cpython-312.pyc ADDED
Binary file (3.08 kB)
src/__pycache__/ticketing.cpython-312.pyc ADDED
Binary file (1.02 kB)
src/__pycache__/vector_store.cpython-312.pyc ADDED
Binary file (570 Bytes)
src/chunking.py ADDED
@@ -0,0 +1,12 @@
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ def chunk_documents(documents):
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         separators=["\n\n", "\n", ".", " ", ""]
+     )
+
+     chunks = text_splitter.split_documents(documents)
+
+     return chunks
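A small usage sketch of the splitter settings above (illustrative only; `langchain_core` is assumed to be available, as the pinned LangChain packages pull it in):

```python
# Illustrative only: apply the same 1000/200 splitter to a toy Document.
from langchain_core.documents import Document

from src.chunking import chunk_documents

toy = Document(
    page_content="Dosage and administration guidance. " * 120,
    metadata={"source": "toy.pdf", "page": 0},
)
chunks = chunk_documents([toy])
print(len(chunks), [len(c.page_content) for c in chunks[:3]])  # a handful of <=1000-character chunks
```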
src/loader.py ADDED
@@ -0,0 +1,18 @@
+ import os
+
+ from langchain_community.document_loaders import PyPDFLoader
+
+
+ def load_all_pdfs(folder_path="data"):
+     """Load all PDF files from a folder into LangChain documents."""
+     documents = []
+
+     if not os.path.isdir(folder_path):
+         return documents
+
+     for file_name in sorted(os.listdir(folder_path)):
+         if file_name.lower().endswith(".pdf"):
+             loader = PyPDFLoader(os.path.join(folder_path, file_name))
+             documents.extend(loader.load())
+
+     return documents
src/memory.py ADDED
@@ -0,0 +1,10 @@
+ chat_history = []
+
+ def add_to_memory(user_input, bot_response):
+     chat_history.append({
+         "user": user_input,
+         "bot": bot_response
+     })
+
+ def get_memory():
+     return chat_history
src/prompts.py ADDED
@@ -0,0 +1,11 @@
+ SYSTEM_PROMPT = """
+ You are a pharmaceutical employee support AI.
+
+ Rules:
+ - Answer only using the provided context.
+ - Always include citations (file name + page) when available.
+ - If information is missing, say: "I couldn't find this information."
+ - Be clear and professional.
+ - If the user's problem is not solved or remains unclear, suggest creating a support ticket.
+ - If the user agrees, call the function to create a ticket.
+ """
src/rag_pipelines.py ADDED
@@ -0,0 +1,89 @@
+ from __future__ import annotations
+
+ import json
+ from typing import List, Tuple
+
+ from openai import OpenAI
+
+ from src.prompts import SYSTEM_PROMPT
+ from src.ticketing import create_github_issue
+
+
+ def retrieve_documents(query, db, k=3):
+     return db.similarity_search(query, k=k)
+
+
+ def build_context(docs) -> Tuple[str, List[str]]:
+     context_parts: List[str] = []
+     sources: List[str] = []
+
+     for doc in docs:
+         context_parts.append(doc.page_content)
+         source = doc.metadata.get("source", "unknown")
+         page = doc.metadata.get("page", "?")
+         source_info = f"{source} - page {page}"
+         if source_info not in sources:
+             sources.append(source_info)
+
+     return "\n\n".join(context_parts), sources
+
+
+ def format_chat_history(memory):
+     messages = []
+     for item in memory:
+         messages.append({"role": "user", "content": item["user"]})
+         messages.append({"role": "assistant", "content": item["bot"]})
+     return messages
+
+
+ def _ticket_tool():
+     return [
+         {
+             "type": "function",
+             "function": {
+                 "name": "create_support_ticket",
+                 "description": "Create a support ticket when the user has an unresolved issue.",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "title": {"type": "string"},
+                         "description": {"type": "string"},
+                     },
+                     "required": ["title", "description"],
+                 },
+             },
+         }
+     ]
+
+
+ def generate_answer(query, context, sources, memory):
+     client = OpenAI()
+     history_messages = format_chat_history(memory)
+
+     response = client.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {"role": "system", "content": SYSTEM_PROMPT},
+             *history_messages,
+             {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
+         ],
+         tools=_ticket_tool(),
+         tool_choice="auto",
+     )
+
+     message = response.choices[0].message
+
+     if message.tool_calls:
+         tool_call = message.tool_calls[0]
+         if tool_call.function.name == "create_support_ticket":
+             args = json.loads(tool_call.function.arguments)
+             issue_url = create_github_issue(
+                 title=args["title"],
+                 description=args["description"],
+             )
+             return f"Support ticket result: {issue_url}"
+
+     answer = message.content or "I could not generate an answer."
+     if not sources:
+         return answer
+     return answer + "\n\nSources:\n" + "\n".join(sources)
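When the model opts to call the tool, `tool_call.function.arguments` arrives as a JSON string matching the schema declared in `_ticket_tool()`. A hedged sketch of the payload that branch parses (the title and description below are made up):

```python
# Illustrative only: the kind of arguments string the tool-call branch receives.
import json

raw_arguments = (
    '{"title": "Cannot locate Annex 11 guidance", '
    '"description": "User could not find the referenced GMP annex in the indexed PDFs."}'
)
args = json.loads(raw_arguments)
# generate_answer() forwards these fields to create_github_issue(title=..., description=...).
print(args["title"], "-", args["description"])
```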
src/ticketing.py ADDED
@@ -0,0 +1,22 @@
+ import requests
+
+ from config import GITHUB_REPO, GITHUB_TOKEN
+
+
+ def create_github_issue(title: str, description: str) -> str:
+     """Create a GitHub issue if credentials are configured."""
+     if not GITHUB_TOKEN or not GITHUB_REPO:
+         return "Ticketing is not configured. Set GITHUB_TOKEN and GITHUB_REPO."
+
+     url = f"https://api.github.com/repos/{GITHUB_REPO}/issues"
+     headers = {
+         "Authorization": f"token {GITHUB_TOKEN}",
+         "Accept": "application/vnd.github+json",
+     }
+     data = {"title": title, "body": description}
+
+     response = requests.post(url, json=data, headers=headers, timeout=20)
+     if response.status_code == 201:
+         return response.json().get("html_url", "Ticket created.")
+
+     return f"Failed to create ticket ({response.status_code}): {response.text}"
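Calling the helper directly is the simplest way to exercise the ticketing path (illustrative; with the secrets unset it returns the "not configured" message instead of raising):

```python
# Illustrative only: exercise create_github_issue outside the chat flow.
from src.ticketing import create_github_issue

result = create_github_issue(
    title="Test ticket from local run",
    description="Verifying GITHUB_TOKEN / GITHUB_REPO wiring.",
)
print(result)  # issue URL on success, otherwise a readable error string
```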
src/vector_store.py ADDED
@@ -0,0 +1,12 @@
+ from langchain_community.vectorstores import FAISS
+ from langchain_openai import OpenAIEmbeddings
+
+ def create_vector_store(chunks):
+     embeddings = OpenAIEmbeddings()
+
+     db = FAISS.from_documents(
+         documents=chunks,
+         embedding=embeddings
+     )
+
+     return db
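Not part of this commit, but if re-embedding on every restart becomes slow, a FAISS store built this way can be persisted. A sketch assuming a recent `langchain_community` where `FAISS.load_local` accepts `allow_dangerous_deserialization` (the `faiss_index` path is arbitrary):

```python
# Illustrative extension, not in the repository: cache the FAISS index on disk.
import os

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

from src.vector_store import create_vector_store

INDEX_DIR = "faiss_index"  # arbitrary local path


def load_or_build(chunks):
    embeddings = OpenAIEmbeddings()
    if os.path.isdir(INDEX_DIR):
        # FAISS indexes are pickled on disk, so loading requires an explicit opt-in.
        return FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)
    db = create_vector_store(chunks)
    db.save_local(INDEX_DIR)
    return db
```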