Upload 24 files

- .gitattributes +4 -0
- README.md +49 -0
- app.py +54 -0
- config.py +11 -0
- data/Example-Drug-Monograph.pdf +3 -0
- data/Pharmaceutical-Manufacturing-Handbook-Production-and-Processes-Wiley-2008.pdf +3 -0
- data/drugmonographs.pdf +3 -0
- data/pe-009-17-gmp-guide-xannexes.pdf +3 -0
- requirements.txt +11 -0
- src/_.txt +0 -0
- src/__init__.py +0 -0
- src/__pycache__/chunking.cpython-312.pyc +0 -0
- src/__pycache__/loader.cpython-312.pyc +0 -0
- src/__pycache__/memory.cpython-312.pyc +0 -0
- src/__pycache__/prompts.cpython-312.pyc +0 -0
- src/__pycache__/rag_pipelines.cpython-312.pyc +0 -0
- src/__pycache__/ticketing.cpython-312.pyc +0 -0
- src/__pycache__/vector_store.cpython-312.pyc +0 -0
- src/chunking.py +12 -0
- src/loader.py +18 -0
- src/memory.py +10 -0
- src/prompts.py +11 -0
- src/rag_pipelines.py +89 -0
- src/ticketing.py +22 -0
- src/vector_store.py +12 -0
.gitattributes
ADDED
@@ -0,0 +1,4 @@
+data/drugmonographs.pdf filter=lfs diff=lfs merge=lfs -text
+data/Example-Drug-Monograph.pdf filter=lfs diff=lfs merge=lfs -text
+data/pe-009-17-gmp-guide-xannexes.pdf filter=lfs diff=lfs merge=lfs -text
+data/Pharmaceutical-Manufacturing-Handbook-Production-and-Processes-Wiley-2008.pdf filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,49 @@
+---
+title: AI Pharma Support Assistant
+emoji: "💊"
+colorFrom: blue
+colorTo: indigo
+sdk: streamlit
+app_file: app.py
+pinned: false
+---
+
+# AI Pharma Support Assistant (Hugging Face Spaces)
+
+This project is ready to run as a **Streamlit Space**.
+
+## Features
+
+- Loads PDF pharma documents from `data/`
+- Chunks content and builds a FAISS vector store
+- Answers questions with RAG + citations
+- Optional GitHub ticket creation via tool-calling
+
+## Required Secrets in Hugging Face Space
+
+In your Space, go to **Settings -> Variables and secrets** and add:
+
+- `OPENAI_API_KEY` (required)
+- `GITHUB_TOKEN` (optional, for ticketing)
+- `GITHUB_REPO` (optional, format: `owner/repo`)
+
+## Local Run
+
+```powershell
+python -m venv venv
+venv\Scripts\activate
+pip install -r requirements.txt
+streamlit run app.py
+```
+
+## Deploy to Hugging Face Spaces
+
+1. Create a new Space and choose the **Streamlit** SDK.
+2. Push this repository to the Space Git repo.
+3. Add the required secrets (`OPENAI_API_KEY` at minimum).
+4. The Space will build automatically and run `app.py`.
+
+## Notes
+
+- Keep `.env` local only. Do not commit secrets.
+- PDF files should remain under `data/`.
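Taken together, the README's feature list maps onto the modules added below. A condensed sketch of the end-to-end flow, using only functions from this commit's `src/` package (the question string is illustrative):

```python
# End-to-end sketch of the pipeline described above, mirroring what
# app.py below wires up. Run from the repo root with data/*.pdf present.
from src.loader import load_all_pdfs
from src.chunking import chunk_documents
from src.vector_store import create_vector_store
from src.rag_pipelines import build_context, generate_answer, retrieve_documents
from src.memory import add_to_memory, get_memory

docs = load_all_pdfs("data")      # one Document per PDF page
chunks = chunk_documents(docs)    # 1000-char chunks with 200-char overlap
db = create_vector_store(chunks)  # FAISS index over OpenAI embeddings

question = "What does the GMP guide say about sterile filtration?"  # example only
hits = retrieve_documents(question, db, k=3)
context, sources = build_context(hits)
answer = generate_answer(question, context, sources, get_memory())
add_to_memory(question, answer)
print(answer)
```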
app.py
ADDED
@@ -0,0 +1,54 @@
+import streamlit as st
+
+from src.chunking import chunk_documents
+from src.loader import load_all_pdfs
+from src.memory import add_to_memory, get_memory
+from src.rag_pipelines import build_context, generate_answer, retrieve_documents
+from src.vector_store import create_vector_store
+
+st.set_page_config(page_title="AI Pharma Support", layout="wide")
+st.title("AI Pharmaceutical Support Assistant")
+
+
+@st.cache_resource
+def load_system():
+    docs = load_all_pdfs()
+    if not docs:
+        raise ValueError("No PDF documents found in the data/ directory.")
+
+    chunks = chunk_documents(docs)
+    if not chunks:
+        raise ValueError("PDFs were loaded, but no text chunks were created.")
+
+    return create_vector_store(chunks)
+
+
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+
+try:
+    db = load_system()
+except Exception as exc:
+    st.error(
+        "App initialization failed. Check OPENAI_API_KEY, dependencies, and data/*.pdf files."
+    )
+    st.exception(exc)
+    st.stop()
+
+user_input = st.text_input("Ask your pharma support question:")
+
+if user_input:
+    docs = retrieve_documents(user_input, db)
+    context, sources = build_context(docs)
+    memory = get_memory()
+    answer = generate_answer(user_input, context, sources, memory)
+
+    add_to_memory(user_input, answer)
+    st.session_state.chat_history.append(("You", user_input))
+    st.session_state.chat_history.append(("AI", answer))
+
+for role, message in st.session_state.chat_history:
+    if role == "You":
+        st.markdown(f"**You:** {message}")
+    else:
+        st.markdown(f"**AI:** {message}")
config.py
ADDED
@@ -0,0 +1,11 @@
+import os
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+# Load local .env for development; Hugging Face Spaces injects variables directly.
+load_dotenv(dotenv_path=Path(__file__).parent / ".env")
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
+GITHUB_REPO = os.getenv("GITHUB_REPO", "")
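Note that `src/rag_pipelines.py` later in this commit calls `OpenAI()` with no arguments, so the client reads `OPENAI_API_KEY` from the environment (which `load_dotenv` populates locally) rather than from this module's constant. A minimal sketch of wiring the key through `config.py` explicitly, if that is preferred:

```python
# Sketch: pass the key from config.py instead of relying on the client
# picking up OPENAI_API_KEY from the environment.
from openai import OpenAI

from config import OPENAI_API_KEY

client = OpenAI(api_key=OPENAI_API_KEY)  # an explicit api_key is supported
```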
data/Example-Drug-Monograph.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:14839f729c948040dbd9e205b0699374fced68ebdb5da7ace466bfc000cfa26e
+size 1166829
data/Pharmaceutical-Manufacturing-Handbook-Production-and-Processes-Wiley-2008.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6077a5c9ecafab832008c8a8937a6620102ca18df56c06a74382fff19bccd5f
+size 13092552
data/drugmonographs.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b49755d3ab71a2cf0e1e99121ff9122fcf7717bfa9d70f8b2318484187318d8
+size 5043791
data/pe-009-17-gmp-guide-xannexes.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06d9ed3eed8197a10738457ad2f6aecaa9357c3c9b0c19b3f2d323b2e1c64ef3
+size 2368402
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+langchain
+langchain-community
+langchain-openai
+langchain-text-splitters
+openai
+faiss-cpu
+pypdf
+tiktoken
+streamlit
+python-dotenv
+requests
src/_.txt
ADDED
File without changes
src/__init__.py
ADDED
File without changes
src/__pycache__/chunking.cpython-312.pyc
ADDED
Binary file (576 Bytes)

src/__pycache__/loader.cpython-312.pyc
ADDED
Binary file (788 Bytes)

src/__pycache__/memory.cpython-312.pyc
ADDED
Binary file (520 Bytes)

src/__pycache__/prompts.cpython-312.pyc
ADDED
Binary file (553 Bytes)

src/__pycache__/rag_pipelines.cpython-312.pyc
ADDED
Binary file (3.08 kB)

src/__pycache__/ticketing.cpython-312.pyc
ADDED
Binary file (1.02 kB)

src/__pycache__/vector_store.cpython-312.pyc
ADDED
Binary file (570 Bytes)
src/chunking.py
ADDED
@@ -0,0 +1,12 @@
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+def chunk_documents(documents):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        separators=["\n\n", "\n", ".", " ", ""]
+    )
+
+    chunks = text_splitter.split_documents(documents)
+
+    return chunks
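A quick way to exercise the splitter in isolation; the `Document` content here is made-up sample text:

```python
# Illustrative smoke test for chunk_documents (the sample text is invented).
from langchain_core.documents import Document

from src.chunking import chunk_documents

doc = Document(
    page_content="Sterile filtration is required before filling. " * 150,
    metadata={"source": "data/example.pdf", "page": 0},
)
chunks = chunk_documents([doc])
# Expect several chunks of roughly chunk_size characters, overlapping by ~200.
print(len(chunks), max(len(c.page_content) for c in chunks))
```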
src/loader.py
ADDED
@@ -0,0 +1,18 @@
+import os
+
+from langchain_community.document_loaders import PyPDFLoader
+
+
+def load_all_pdfs(folder_path="data"):
+    """Load all PDF files from a folder into LangChain documents."""
+    documents = []
+
+    if not os.path.isdir(folder_path):
+        return documents
+
+    for file_name in sorted(os.listdir(folder_path)):
+        if file_name.lower().endswith(".pdf"):
+            loader = PyPDFLoader(os.path.join(folder_path, file_name))
+            documents.extend(loader.load())
+
+    return documents
src/memory.py
ADDED
@@ -0,0 +1,10 @@
+chat_history = []
+
+def add_to_memory(user_input, bot_response):
+    chat_history.append({
+        "user": user_input,
+        "bot": bot_response
+    })
+
+def get_memory():
+    return chat_history
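One caveat with this module: `chat_history` is process-global, so every visitor to a deployed Space shares the same memory. A per-session variant built on `st.session_state` might look like this (a sketch, not part of this commit):

```python
# Hypothetical per-session memory using Streamlit's session state,
# so concurrent users do not share one global chat_history list.
import streamlit as st

def _history():
    if "rag_memory" not in st.session_state:
        st.session_state.rag_memory = []
    return st.session_state.rag_memory

def add_to_memory(user_input, bot_response):
    _history().append({"user": user_input, "bot": bot_response})

def get_memory():
    return _history()
```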
src/prompts.py
ADDED
@@ -0,0 +1,11 @@
+SYSTEM_PROMPT = """
+You are a pharmaceutical employee support AI.
+
+Rules:
+- Answer only using the provided context.
+- Always include citations (file name + page) when available.
+- If information is missing, say: "I couldn't find this information."
+- Be clear and professional.
+- If the user's problem is not solved or remains unclear, suggest creating a support ticket.
+- If the user agrees, call the function to create a ticket.
+"""
src/rag_pipelines.py
ADDED
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+import json
+from typing import List, Tuple
+
+from openai import OpenAI
+
+from src.prompts import SYSTEM_PROMPT
+from src.ticketing import create_github_issue
+
+
+def retrieve_documents(query, db, k=3):
+    return db.similarity_search(query, k=k)
+
+
+def build_context(docs) -> Tuple[str, List[str]]:
+    context_parts: List[str] = []
+    sources: List[str] = []
+
+    for doc in docs:
+        context_parts.append(doc.page_content)
+        source = doc.metadata.get("source", "unknown")
+        page = doc.metadata.get("page", "?")
+        source_info = f"{source} - page {page}"
+        if source_info not in sources:
+            sources.append(source_info)
+
+    return "\n\n".join(context_parts), sources
+
+
+def format_chat_history(memory):
+    messages = []
+    for item in memory:
+        messages.append({"role": "user", "content": item["user"]})
+        messages.append({"role": "assistant", "content": item["bot"]})
+    return messages
+
+
+def _ticket_tool():
+    return [
+        {
+            "type": "function",
+            "function": {
+                "name": "create_support_ticket",
+                "description": "Create a support ticket when the user has an unresolved issue.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "title": {"type": "string"},
+                        "description": {"type": "string"},
+                    },
+                    "required": ["title", "description"],
+                },
+            },
+        }
+    ]
+
+
+def generate_answer(query, context, sources, memory):
+    client = OpenAI()
+    history_messages = format_chat_history(memory)
+
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {"role": "system", "content": SYSTEM_PROMPT},
+            *history_messages,
+            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
+        ],
+        tools=_ticket_tool(),
+        tool_choice="auto",
+    )
+
+    message = response.choices[0].message
+
+    if message.tool_calls:
+        tool_call = message.tool_calls[0]
+        if tool_call.function.name == "create_support_ticket":
+            args = json.loads(tool_call.function.arguments)
+            issue_url = create_github_issue(
+                title=args["title"],
+                description=args["description"],
+            )
+            return f"Support ticket result: {issue_url}"
+
+    answer = message.content or "I could not generate an answer."
+    if not sources:
+        return answer
+    return answer + "\n\nSources:\n" + "\n".join(sources)
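`generate_answer` short-circuits the tool call: it returns the ticket URL itself rather than sending the tool result back to the model. The standard OpenAI tool-calling round trip, sketched here as a hypothetical replacement for that early return (all names are the locals already in scope at that point in the function):

```python
# Hypothetical fragment: let the model phrase the ticket confirmation
# by appending the assistant tool-call message and a "tool" result message.
follow_up = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
        message,  # the assistant message that carried the tool call
        {"role": "tool", "tool_call_id": tool_call.id, "content": issue_url},
    ],
)
return follow_up.choices[0].message.content
```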
src/ticketing.py
ADDED
@@ -0,0 +1,22 @@
+import requests
+
+from config import GITHUB_REPO, GITHUB_TOKEN
+
+
+def create_github_issue(title: str, description: str) -> str:
+    """Create a GitHub issue if credentials are configured."""
+    if not GITHUB_TOKEN or not GITHUB_REPO:
+        return "Ticketing is not configured. Set GITHUB_TOKEN and GITHUB_REPO."
+
+    url = f"https://api.github.com/repos/{GITHUB_REPO}/issues"
+    headers = {
+        "Authorization": f"token {GITHUB_TOKEN}",
+        "Accept": "application/vnd.github+json",
+    }
+    data = {"title": title, "body": description}
+
+    response = requests.post(url, json=data, headers=headers, timeout=20)
+    if response.status_code == 201:
+        return response.json().get("html_url", "Ticket created.")
+
+    return f"Failed to create ticket ({response.status_code}): {response.text}"
src/vector_store.py
ADDED
@@ -0,0 +1,12 @@
+from langchain_community.vectorstores import FAISS
+from langchain_openai import OpenAIEmbeddings
+
+def create_vector_store(chunks):
+    embeddings = OpenAIEmbeddings()
+
+    db = FAISS.from_documents(
+        documents=chunks,
+        embedding=embeddings
+    )
+
+    return db
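Because `create_vector_store` embeds every chunk from scratch, each cold start of the Space pays the embedding cost again. The FAISS wrapper can persist its index; a sketch using `save_local`/`load_local` from langchain_community (the `faiss_index` directory name is made up, and the deserialization flag should be checked against your installed version):

```python
# Hypothetical caching wrapper around create_vector_store.
import os

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

from src.vector_store import create_vector_store

INDEX_DIR = "faiss_index"

def load_or_build_vector_store(chunks):
    embeddings = OpenAIEmbeddings()
    if os.path.isdir(INDEX_DIR):
        # Only load indexes you wrote yourself: load_local unpickles
        # stored metadata, hence the explicit opt-in flag.
        return FAISS.load_local(
            INDEX_DIR, embeddings, allow_dangerous_deserialization=True
        )
    db = create_vector_store(chunks)
    db.save_local(INDEX_DIR)
    return db
```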