Spaces:
Runtime error
Runtime error
| import os | |
| from langchain_groq import ChatGroq | |
| from langchain.prompts import ChatPromptTemplate, PromptTemplate | |
| from langchain.output_parsers import ResponseSchema, StructuredOutputParser | |
| from urllib.parse import urljoin, urlparse | |
| import requests | |
| from io import BytesIO | |
| from langchain_chroma import Chroma | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from langchain_core.prompts import ChatPromptTemplate | |
| import gradio as gr | |
| from PyPDF2 import PdfReader | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_core.runnables import RunnablePassthrough | |
| # Simple session management | |
class SessionManager:
    """In-memory store of per-session chat transcripts.

    Sessions are keyed by an arbitrary session_id string; each session is a
    chronological list of {"user": ..., "ai": ...} dicts.
    """

    def __init__(self):
        # session_id -> list of interaction dicts
        self.sessions = {}

    def get_or_create_session(self, session_id):
        """Return the interaction list for *session_id*, creating it if absent."""
        return self.sessions.setdefault(session_id, [])

    def add_interaction(self, session_id, user_message, ai_response):
        """Append one user/assistant exchange to the session transcript."""
        self.get_or_create_session(session_id).append(
            {"user": user_message, "ai": ai_response}
        )

    def get_history(self, session_id, max_turns=5):
        """Render the last *max_turns* exchanges as plain text for prompting."""
        recent = self.get_or_create_session(session_id)[-max_turns:]
        rendered = [
            f"User: {turn['user']}\nAssistant: {turn['ai']}\n"
            for turn in recent
        ]
        return "\n".join(rendered).strip()
# Initialize session manager
# Single module-level store shared by all chat requests.
session_manager = SessionManager()
# Groq API key read from the 'GBV' environment variable; None if unset.
groq_api_key= os.environ.get('GBV')
# Embedding model used for both indexing and querying the Chroma store.
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
def scrape_websites(base_urls):
    """Scrape each base URL plus its same-site internal links (HTML and PDF).

    Args:
        base_urls: iterable of URL strings; blank entries are skipped.

    Returns:
        dict mapping each successfully fetched URL to its extracted text.
        Returns {} if an unexpected error aborts the crawl.
    """
    try:
        visited_links = set()   # To avoid revisiting the same link
        content_by_url = {}     # Store content from each URL
        for base_url in base_urls:
            if not base_url.strip():
                continue  # Skip empty or invalid URLs
            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if not html_content:
                continue
            content_by_url[base_url] = clean_body_content(html_content)
            visited_links.add(base_url)
            # Extract and process all internal links of the base page.
            soup = BeautifulSoup(html_content, "html.parser")
            for link in extract_internal_links(base_url, soup):
                if link in visited_links:
                    continue
                # Mark visited up front so a failed fetch is not retried
                # when the same link appears under another base URL.
                visited_links.add(link)
                if link.lower().endswith('.pdf'):
                    # PDFs are binary: fetch once through the PDF extractor
                    # instead of first downloading them as (mangled) text and
                    # HTML-cleaning the result, as the old flow did.
                    print(f"Extracting PDF content from: {link}")
                    pdf_content = extract_pdf_text(link)
                    if pdf_content:
                        content_by_url[link] = pdf_content
                else:
                    print(f"Scraping link: {link}")
                    page_content = fetch_page_content(link)
                    if page_content:
                        content_by_url[link] = clean_body_content(page_content)
        return content_by_url
    except Exception as e:
        # Best-effort crawl: log and return what we have semantics are replaced
        # by an empty dict, matching the original contract.
        print(f"Error during scraping: {e}")
        return {}
def fetch_page_content(url):
    """Download *url* and return the response body as text.

    Returns None (after logging the error) on any network or HTTP failure.
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None
    return reply.text
def extract_internal_links(base_url, soup):
    """Collect absolute same-site URLs from every anchor tag in *soup*.

    Relative hrefs are resolved against *base_url*; only links whose host
    matches the base URL's host are kept.
    """
    found = set()
    for anchor_tag in soup.find_all("a", href=True):
        candidate = urljoin(base_url, anchor_tag["href"])
        if is_internal_link(base_url, candidate):
            found.add(candidate)
    return found
def is_internal_link(base_url, link_url):
    """Return True when *link_url* points at the same host as *base_url*."""
    return urlparse(link_url).netloc == urlparse(base_url).netloc
def extract_pdf_text(pdf_url):
    """Download a PDF and return its concatenated page text.

    Returns None if the download fails, the PDF cannot be parsed, or no
    text at all could be extracted.
    """
    try:
        # timeout added for consistency with fetch_page_content; a hung
        # PDF download previously blocked the whole crawl indefinitely.
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            # extract_text() may return None for image-only pages; coalesce
            # to "" so a single such page no longer raises TypeError on +=.
            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)
        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        # PyPDF2 parse errors (corrupt/encrypted files) end up here.
        print(f"Error reading PDF {pdf_url}: {e}")
        return None
def clean_body_content(html_content):
    """Strip <script>/<style> nodes from *html_content* and return the
    visible text, one trimmed non-empty line per output line."""
    soup = BeautifulSoup(html_content, "html.parser")
    for node in soup(["script", "style"]):
        node.extract()
    raw_text = soup.get_text(separator="\n")
    kept = []
    for raw_line in raw_text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept.append(trimmed)
    return "\n".join(kept)
if __name__ == "__main__":
    # Crawl the source site (pages + linked PDFs) into {url: text}.
    website = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"
    ]
    all_content = scrape_websites(website)
    # Flatten the dict into (url, content) tuples.
    temp_list = []
    for url, content in all_content.items():
        temp_list.append((url, content))
    # Normalize every element into a single "url: ..., content: ..." string.
    processed_texts = []
    for element in temp_list:
        if isinstance(element, tuple):
            url, content = element
            processed_texts.append(f"url: {url}, content: {content}")
        elif isinstance(element, str):
            processed_texts.append(element)
        else:
            # Fallback: stringify anything unexpected rather than fail.
            processed_texts.append(str(element))
    def chunk_string(s, chunk_size=1000):
        # Split s into fixed-size character chunks suitable for embedding.
        return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
    chunked_texts = []
    for text in processed_texts:
        chunked_texts.extend(chunk_string(text))
    # Create (or reopen) a local persistent Chroma collection and index chunks.
    vectorstore = Chroma(
        collection_name="GBVR_Datst",
        embedding_function=embed_model,
        persist_directory="./",
    )
    vectorstore.get().keys()  # NOTE(review): result unused — looks like leftover debug code.
    vectorstore.add_texts(chunked_texts)
# Updated template to include conversation history
# Runtime prompt template; placeholders {context}, {question} and
# {conversation_history} are filled in rag_chain().
template = ("""
You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
1. **Warm & Natural Interaction**
- If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.
- Example responses:
- "😊 Good morning! How can I assist you today?"
- "Hello! What can I do for you? 🚀"
2. **Precise Information Extraction**
- Provide only the relevant details from the given context: {context}.
- Do not generate extra content or assumptions beyond the provided information.
3. **Conversational & Engaging Tone**
- Keep responses friendly, natural, and engaging.
- Use occasional emojis (e.g., 😊, 🚀) to make interactions more lively.
4. **Awareness of Real-Time Context**
- If necessary, acknowledge the current date and time to show awareness of real-world updates.
5. **Handling Missing Information**
- If no relevant information exists in the context, respond politely:
- "I don't have that information at the moment, but I'm happy to help with something else! 😊"
6. **Personalized Interaction**
- Use the conversation history to provide more personalized and contextually relevant responses.
- Previous conversation history: {conversation_history}
7. **Direct, Concise Responses**
- If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.
8. **Extracting Relevant Links**
- If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
- Example response:
- "Here is the link you requested: [URL]"
**Context:** {context}
**User's Question:** {question}
**Your Response:**
""")
rag_prompt = PromptTemplate.from_template(template)
# Default similarity retriever over the indexed chunks.
# NOTE(review): vectorstore is created under the __main__ guard above, so this
# line only works when the file is executed directly — confirm intended.
retriever = vectorstore.as_retriever()
# Groq-hosted Llama 3.3 70B chat model used for all generations.
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
# Dictionary to store user sessions with session IDs
# NOTE(review): appears unused — SessionManager.sessions holds the real state.
user_sessions = {}
# Define the RAG chain with session history
def rag_chain(question, session_id="default"):
    """Answer *question* using retrieved context plus the session's history.

    Side effect: records the question/answer pair in the session transcript
    via the module-level session_manager.
    """
    # Prior turns for this session ("" for a brand-new session).
    history_text = session_manager.get_history(session_id)
    # Retrieve supporting chunks and merge them into one context string.
    docs = retriever.invoke(question)
    merged_context = "\n".join(doc.page_content for doc in docs)
    # Fill the prompt template and query the LLM.
    filled_prompt = rag_prompt.format(
        context=merged_context,
        question=question,
        conversation_history=history_text,
    )
    answer = llm.invoke(filled_prompt).content
    # Persist this exchange so later turns can be personalized.
    session_manager.add_interaction(session_id, question, answer)
    return answer
# Define the RAG memory stream function
def rag_memory_stream(message, history):
    """Gradio streaming callback: yield the answer growing word by word.

    A session ID is derived from the first user message found in *history*
    (via hash of its first 20 chars); falls back to a fixed default when the
    history is empty.
    """
    session_id = None
    for past in history:
        if past[0]:  # first entry that actually has a user message
            session_id = hash(past[0][:20])
            break
    if session_id is None:
        # Default session ID if history is empty
        session_id = "default_session"
    # Get the full response, then stream it incrementally.
    full_answer = rag_chain(message, str(session_id))
    streamed = ""
    for token in full_answer.split(' '):
        streamed += token + " "
        yield streamed.strip()
# Title with emojis
title = "GBVR Chatbot"
# Custom CSS for styling the interface
# (passed verbatim to gr.ChatInterface via its css= argument below)
custom_css = """
/* Custom CSS for styling the interface */
body {
font-family: "Arial", serif;
}
.gradio-container {
font-family: "Times New Roman", serif;
}
.gr-button {
background-color: #007bff; /* Blue button */
color: white;
border: none;
border-radius: 5px;
font-size: 16px;
padding: 10px 20px;
cursor: pointer;
}
.gr-textbox:focus, .gr-button:focus {
outline: none; /* Remove outline focus for a cleaner look */
}
/* Specific CSS for the welcome message */
.gradio-description {
font-size: 30px; /* Set font size for the welcome message */
font-family: "Arial", sans-serif;
text-align: center; /* Optional: Center-align the text */
padding: 20px; /* Optional: Add padding around the welcome message */
}
"""
# Generate a simple welcome message using the LLM
def generate_welcome_message():
    """Ask the LLM for a short, supportive greeting and return its text."""
    welcome_prompt = """
Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
Keep it under 3 sentences, and use simple language.
Make it warm and supportive but direct and easy to read.
"""
    # Get the welcome message from the LLM
    return llm.invoke(welcome_prompt).content
# Create simple welcome message
# NOTE(review): this calls the Groq API at import time — confirm acceptable.
welcome_msg = generate_welcome_message()
# Create the Chat Interface with welcome message
demo = gr.ChatInterface(
    fn=rag_memory_stream,   # streaming generator callback
    title=title,
    fill_height=True,
    theme="soft",
    css=custom_css,  # Apply the custom CSS
    description=welcome_msg
)
# Launch the app
if __name__ == "__main__":
    # share=True exposes a public Gradio link; debug=True surfaces tracebacks.
    demo.launch(share=True, inbrowser=True, debug=True)