Spaces:

Samizie
/

WebGPT-1.0

Sleeping

App Files Files Community

WebGPT-1.0 / app.py

Samizie

Update app.py

c3f9309 verified 5 months ago

raw

history blame contribute delete

6.94 kB

	import subprocess
	import streamlit as st
	from decouple import config
	import asyncio
	from langchain.chains import create_retrieval_chain
	from langchain.chains.combine_documents import create_stuff_documents_chain
	from langchain_groq import ChatGroq
	from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
	from langchain_core.messages import SystemMessage
	from scraper.scraper import process_urls
	from embedding.vector_store import initialize_vector_store, clear_chroma_db
	from conversation.talks import clean_input, small_talks

	#subprocess.run(["playwright", "install"], check=True)
	#subprocess.run(["playwright", "install-deps"], check=True)

	# Wipe whatever ChromaDB data a previous run left behind before anything else happens.
	# NOTE(review): this sits at module level, so it executes every time the script body runs —
	# confirm that is intended and not only "once at startup".
	clear_chroma_db()

	# Groq API key, looked up through decouple's config().
	groq_api = config("GROQ_API_KEY")

	# Chat model used to answer questions; temperature=0 is passed to the Groq client.
	llm = ChatGroq(
	    temperature=0,
	    groq_api_key=groq_api,
	    model="llama-3.2-1b-preview",
	)



	# On Windows, switch asyncio to the Proactor event loop policy before any loop is created.
	import sys

	if sys.platform.startswith("win"):
	    proactor_policy = asyncio.WindowsProactorEventLoopPolicy()
	    asyncio.set_event_loop_policy(proactor_policy)

	#Async helper function
	def run_asyncio_coroutine(coro):
	    """Run ``coro`` to completion on a fresh event loop and return its result.

	    A new loop is created for every call and installed as the current loop
	    while the coroutine runs. The loop is always closed afterwards (even when
	    the coroutine raises) so repeated calls do not accumulate open loops.

	    Args:
	        coro: The coroutine object (or other awaitable accepted by
	            ``loop.run_until_complete``) to execute.

	    Returns:
	        Whatever ``coro`` returns.

	    Raises:
	        Any exception raised by ``coro`` is propagated unchanged.
	    """
	    loop = asyncio.new_event_loop()
	    asyncio.set_event_loop(loop)
	    try:
	        return loop.run_until_complete(coro)
	    finally:
	        try:
	            # Give pending async generators a chance to finalise before the loop goes away.
	            loop.run_until_complete(loop.shutdown_asyncgens())
	        finally:
	            # Do not leave a closed loop registered as the current one.
	            asyncio.set_event_loop(None)
	            loop.close()

	import streamlit as st

	st.title("WebGPT 1.0 🤖")

	# URL entry: the scrape button stays disabled until the text area holds non-blank text.
	urls = st.text_area("Enter URLs (one per line)")
	has_urls = bool(urls.strip())
	run_scraper = st.button("Run Scraper", disabled=not has_urls)

	# Seed session state; each key is only written when it is not already present.
	#   messages      -> chat history (list of {"role", "text"} dicts)
	#   history       -> past Q&A flattened into one string
	#   scraping_done -> whether a scrape has completed
	#   vector_store  -> store built from the scraped documents
	for state_key, initial_value in (
	    ("messages", []),
	    ("history", ""),
	    ("scraping_done", False),
	    ("vector_store", None),
	):
	    if state_key not in st.session_state:
	        st.session_state[state_key] = initial_value

	# Run scraper: fetch the entered URLs, build the vector store, and unlock the chat.
	if run_scraper:
	    # The text area yields one string; splitlines() copes with "\r\n" as well as "\n",
	    # and blank or whitespace-only lines are dropped so they are never sent to the scraper.
	    url_list = [line.strip() for line in urls.splitlines() if line.strip()]

	    st.write("Fetching and processing URLs... This may take a while.")
	    split_docs = run_asyncio_coroutine(process_urls(url_list))
	    st.session_state.vector_store = initialize_vector_store(split_docs)
	    # Set only after the store exists, so the chat never sees a missing vector store.
	    st.session_state.scraping_done = True
	    st.success("Scraping and processing completed!")

	# ✅ Clear chat button
	if st.button("Clear Chat"):
	    # Reset both the stored message list and the flattened history string.
	    st.session_state["messages"] = []
	    st.session_state["history"] = ""
	    st.success("Chat cleared!")

	# Chat is only offered once a scrape has completed and a vector store exists.
	if not st.session_state.scraping_done:
	    st.warning("Scrape some data first to enable chat!")
	else:
	    st.write("### Chat With WebGPT 💬")

	    # Fixed reply used whenever the retrieved context cannot answer the question.
	    not_found_reply = "I can't find your request in the provided context."

	    # Re-render the stored conversation.
	    for message in st.session_state.messages:
	        with st.chat_message(message["role"]):
	            st.write(message["text"])

	    # Takes in user input
	    user_query = st.chat_input("Ask a question...")

	    if user_query:
	        st.session_state.messages.append({"role": "user", "text": user_query})
	        with st.chat_message("user"):
	            st.write(user_query)

	        user_query_cleaned = clean_input(user_query)
	        response = ""    # Answer text shown to the user
	        source_url = ""  # Empty means "do not show a Source line"

	        if user_query_cleaned in small_talks:
	            # Canned small-talk replies bypass retrieval and the LLM entirely.
	            response = small_talks[user_query_cleaned]
	            source_url = "Knowledge base"
	        else:
	            # Top-k retrieval over the scraped documents.
	            retriever = st.session_state.vector_store.as_retriever(
	                search_kwargs={"k": 5}
	            )
	            retrieved_docs = retriever.invoke(user_query_cleaned)

	            if not retrieved_docs:
	                # Nothing relevant was retrieved, so skip the LLM call.
	                response = not_found_reply
	            else:
	                # The declared variables match the placeholders actually used in the
	                # template, and the context is interpolated exactly once.
	                system_prompt_template = PromptTemplate(
	                    input_variables=["context", "input"],
	                    template="""
	You are WebGPT, an AI assistant for question-answering tasks that only answers questions based on the provided context.

	- Understand the Context below first and provide a relevant answer.
	- If the answer is not found in the Context, reply with: "I can't find your request in the provided context."
	- If the question is unrelated to the Context, reply with: "I can't answer that. do not generate responses."
	- Do not use external knowledge, assumptions, or filler responses. Stick to the context provided.
	- Keep responses clear, concise, and relevant to the user’s query.

	Context:
	{context}

	Now, answer the user's question:
	{input}
	""",
	                )

	                # The stuff-documents chain is invoked directly with the documents already
	                # retrieved above. Wrapping it in a retrieval chain would query the store a
	                # second time and replace any "context" value passed in.
	                scraper_chain = create_stuff_documents_chain(
	                    llm=llm, prompt=system_prompt_template
	                )

	                try:
	                    answer = scraper_chain.invoke(
	                        {"context": retrieved_docs, "input": user_query_cleaned}
	                    )
	                    # An empty model reply falls back to the fixed not-found message.
	                    response = (answer or "").strip() or not_found_reply
	                    source_url = retrieved_docs[0].metadata.get("source", "Unknown")
	                except Exception as e:
	                    # UI boundary: surface the failure in the chat instead of crashing the app.
	                    response = f"Error generating response: {e}"
	                    source_url = ""

	        # ✅ Format and display response (a source is only shown for real answers)
	        formatted_response = f"Answer: {response}"
	        if response != not_found_reply and source_url:
	            formatted_response += f"\n\nSource: {source_url}"

	        st.session_state.messages.append({"role": "assistant", "text": formatted_response})
	        with st.chat_message("assistant"):
	            st.write(formatted_response)

	        # ✅ Track history after the assistant reply is stored, so it includes this turn.
	        st.session_state.history = "\n".join(
	            f"{'User' if msg['role'] == 'user' else 'AI'}: {msg['text']}"
	            for msg in st.session_state.messages
	        )