"""RAG chatbot over a physical-pharmacy PDF.

Pipeline: extract text with PyMuPDF -> chunk -> embed with
sentence-transformers -> index with FAISS -> answer queries via the
Groq chat API -> serve through a Gradio chat UI.
"""

import logging
import os
import pickle

import faiss
import fitz  # PyMuPDF for PDF processing
import gradio as gr
import numpy as np
from groq import Groq
from sentence_transformers import SentenceTransformer

# Track queries and errors in a local log file.
logging.basicConfig(filename='query_logs.log', level=logging.INFO)

# Load the Groq API key from the environment. Never hard-code secrets:
# a key committed to source control must be considered compromised.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("GROQ_API_KEY environment variable not set.")

client = Groq(api_key=groq_api_key)

# Path to the PDF file containing pharmaceutical content.
book_path = 'martins-physical-pharmacy-6th-ed-2011-dr-murtadha-alshareifi.pdf'

# Load the embedding model once at start-up; re-creating it per query
# (as the code previously did inside generate_answer) is very slow.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


def read_pdf(file_path):
    """Extract the text of every page of *file_path*.

    Returns a list with one string per page, or an empty list on any
    error (the error is logged, not raised).
    """
    try:
        doc = fitz.open(file_path)
        return [
            doc.load_page(page_num).get_text("text")
            for page_num in range(doc.page_count)
        ]
    except Exception as e:
        logging.error(f"Error reading PDF: {str(e)}")
        return []


def split_text_into_paragraphs(text_pages, max_tokens=300):
    """Greedily pack blank-line-separated paragraphs into chunks.

    NOTE: despite its name, *max_tokens* is a CHARACTER budget -- the
    cap is enforced with ``len(...)``, not a tokenizer. The name is
    kept for backward compatibility with existing callers.
    """
    chunks = []
    for page in text_pages:
        chunk = ""
        for para in page.split('\n\n'):
            if len(chunk) + len(para) <= max_tokens:
                chunk += para + "\n"
            else:
                chunks.append(chunk.strip())
                chunk = para + "\n"
        if chunk:
            chunks.append(chunk.strip())
    return chunks


def vectorize_text(chunks, batch_size=100, save_path="embeddings.pkl"):
    """Embed *chunks* into a FAISS L2 index, caching the index on disk.

    Returns ``(index, chunks)`` on success or ``(None, None)`` if
    embedding fails (the error is logged).

    NOTE(review): the on-disk cache stores only the index, and is not
    invalidated when the PDF or chunking changes -- delete *save_path*
    after modifying the source document, or the cached index and the
    freshly computed *chunks* will disagree.
    """
    if os.path.exists(save_path):
        with open(save_path, "rb") as f:
            index = pickle.load(f)
        return index, chunks
    try:
        # 384 = embedding dimension of all-MiniLM-L6-v2.
        index = faiss.IndexFlatL2(384)
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            batch_embeddings = embedding_model.encode(
                batch, show_progress_bar=True
            )
            index.add(np.array(batch_embeddings))
        with open(save_path, "wb") as f:
            pickle.dump(index, f)
        return index, chunks
    except Exception as e:
        logging.error(f"Error during vectorization: {str(e)}")
        return None, None


# Load and vectorize the PDF content once, at import time.
text_pages = read_pdf(book_path)
if not text_pages:
    raise RuntimeError("Failed to read PDF content. Check logs for details.")
chunks = split_text_into_paragraphs(text_pages)
vector_index, chunks = vectorize_text(chunks)
if vector_index is None or chunks is None:
    raise RuntimeError("Vectorization failed. Check logs for details.")


def generate_query_embedding(query, model):
    """Return the embedding of *query* as a (1, dim) array."""
    return model.encode([query])


def check_relevancy(distances, threshold=1):
    """True when the best (smallest) L2 distance is within *threshold*."""
    return distances[0][0] <= threshold


# System prompt defining the chatbot's attributes and response structure.
system_prompt = """
You are **PharmaExpert Pro**, an advanced chatbot specialized in the field of pharmaceutical sciences. Your responses should be structured, concise, and informative, making complex topics accessible.

# Response Structure:
1. **Overview**: Start with a brief context to set the user’s expectations.
2. **Definition**: Clearly define the concept being queried.
3. **In-Depth Analysis**: Provide a detailed breakdown of concepts, including:
   - Examples
   - Relevant formulas (if applicable)
   - Learning processes
   - Working mechanisms
   - Purpose
   - Advantages and disadvantages
   - Role in the broader topic
4. **Summary**: Conclude with a short summary of essential takeaways, ensuring clarity and retention.

# Communication Style:
- **Professional yet Accessible**: Keep language rigorous yet clear.
- **Concise and Informative**: Avoid excess details while covering the core information.
- **Encouraging Exploration**: Foster an environment for follow-up questions.

# Unique Qualities:
1. **Source-Specific Expertise**: Refer only to the provided PDF.
2. **Educational Tools**: Use summaries and key points.
3. **Adaptability**: Adjust responses based on the user’s expertise level.
"""


def generate_answer(query):
    """Answer *query* from the indexed PDF, or fall back to a general reply.

    Retrieves the 5 nearest chunks; if the best hit passes the relevancy
    threshold, the chunks are stuffed into a structured prompt and sent
    to the Groq model. Otherwise a scope-limited fallback prompt is used.
    """
    query_embedding = generate_query_embedding(query, embedding_model)
    D, I = vector_index.search(np.array(query_embedding), k=5)
    if check_relevancy(D):
        relevant_chunks = [chunks[i] for i in I[0]]
        combined_text = " ".join(relevant_chunks)
        user_prompt = f"The user has inquired about a complex pharmaceutical topic. Query: {query}"
        assistant_prompt = f"""
Using the following context from the pharmacy PDF, respond with structured detail. **Avoid external citations in your answer.**

**Context:** {combined_text}

**User's question:** {query}

**Response Structure:**
- **Concept Overview**
- **Contextual Relevance**
- **Overview of the Concept**
- **Definition**
- **Foundations**
- **Examples** (including relevant case studies)
- **Formulas** (if available)
- **Key Terms and Definitions**
- **Key Vocabulary**
- **Historical Context**
- **Applications and Practical Uses**
- **Step-by-Step Explanation** of processes or calculations
- **Visual Aids** (suggestions for diagrams or graphs)
- **Visual Aids Explanation**
- **Purpose and Significance**
- **Common Misconceptions**
- **Key Challenges and Controversies** in the field
- **Practical Exercises**
- **Comparative Analysis**
- **Future Implications**
- **Future Directions** or potential advancements
- **Cultural Context**
- **Fun Activities**
- **Quiz Questions** 7 quiz
- **Step-by-Step Guide**
- **Interactive Elements**
- **Summative Table** for quick reference
- **Summative Review**
- **Final Summary**
- **Summary**
"""
        prompt = system_prompt + "\n\n" + user_prompt + "\n\n" + assistant_prompt
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama3-8b-8192",
            temperature=0.7,
            top_p=0.9,
        )
        return response.choices[0].message.content.strip()
    else:
        fallback_prompt = "The user's question is outside the scope of the PDF content. Provide a general answer without referencing external sources."
        fallback_response = client.chat.completions.create(
            messages=[{"role": "user", "content": fallback_prompt}],
            model="llama3-8b-8192",
            temperature=0.7,
            top_p=0.9,
        )
        return fallback_response.choices[0].message.content.strip()


def gradio_interface(user_query):
    """Entry point for a single-turn query; greets on empty input."""
    if user_query.strip() == "":
        return "Welcome to **Physical Pharmacy Book**! Ask me anything related to pharmaceutical sciences."
    return generate_answer(user_query)


# Gradio interface setup.
with gr.Blocks(css=".footer {display: none;}") as iface:
    gr.Markdown(
        """
Your advanced chatbot for pharmaceutical sciences expertise!
""",
        elem_id="header",
    )
    chatbot = gr.Chatbot(type="messages", elem_id="chatbot")
    msg = gr.Textbox(
        label="Enter your query",
        placeholder="Type your question here...",
        lines=2,
        max_lines=5,
    )
    submit_btn = gr.Button("Submit", elem_id="submit-btn")

    def respond(message, chat_history):
        """Append the user turn and the model's answer to the chat history."""
        chat_history.append({"role": "user", "content": message})
        response = generate_answer(message)
        chat_history.append({"role": "assistant", "content": response})
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    submit_btn.click(respond, [msg, chatbot], [msg, chatbot])

# Launch the Gradio app.
if __name__ == "__main__":
    iface.launch()