import streamlit as st
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from langchain_core.documents.base import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_huggingface.chat_models import ChatHuggingFace
from langchain_huggingface.llms import HuggingFaceEndpoint
import os
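
# Dependency note (an assumption about the environment, not stated in the file):
# this app expects crawl4ai, langchain, langchain-core, langchain-community,
# langchain-huggingface, chromadb, and sentence-transformers to be installed,
# e.g. via the Space's requirements.txt.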
# ------------------------------------------------------------------------------
# Set your API tokens
# ------------------------------------------------------------------------------
hf_token = os.getenv("key")  # "key" = env var / Space secret holding the HF token
os.environ['HUGGINGFACEHUB_API_TOKEN'] = hf_token or ""
os.environ['HF_TOKEN'] = hf_token or ""
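# Defensive check (an addition, not in the original app): stop early with a
# readable message if the token secret is missing, rather than failing later
# inside the Hugging Face client calls.
if not hf_token:
    st.error("No Hugging Face token found. Set the 'key' secret for this app.")
    st.stop()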
# ------------------------------------------------------------------------------
# Streamlit App
# ------------------------------------------------------------------------------
st.title("Web Crawler + Semantic Search + Conversational Model")
# Input for the website to crawl
url = st.text_input("Enter a website URL to crawl:")
# Input for semantic search
query = st.text_input("Enter your semantic search query:")
# Button to start the process
if st.button("Analyze and Query"):
    if not url or not query:
        st.error("Please provide both a URL and a semantic search query.")
    else:
        with st.spinner("Crawling website, retrieving documents, and generating a response..."):
            async def main():
                # Crawling
                browser_config = BrowserConfig()
                run_config = CrawlerRunConfig()
                async with AsyncWebCrawler(config=browser_config) as crawler:
                    result = await crawler.arun(url=url, config=run_config)
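                    # Guard (an assumption added here, not in the original): some
                    # pages produce no markdown, and Document(page_content=None)
                    # would fail below, so return a readable message instead.
                    if result.markdown is None or not result.markdown.raw_markdown:
                        return "The crawler returned no markdown content for this URL."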
                doc = Document(page_content=result.markdown.raw_markdown)
                # Split documents into chunks
                text_splitter = CharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=100,
                )
                chunks = text_splitter.split_documents([doc])
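                # Behavioral note (worth knowing, not stated in the original):
                # CharacterTextSplitter splits on "\n\n" by default, so
                # chunk_size=1000 is a soft target; a single long paragraph can
                # yield a chunk larger than 1000 characters.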
                # Embedding and Vector Store
                # (HuggingFaceEmbeddings takes model_name, not model)
                emb = HuggingFaceEmbeddings(model_name='avsolatorio/GIST-small-Embedding-v0')
                db = Chroma.from_documents(chunks, emb, persist_directory='chroma_db')
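                # Caveat (deployment assumption, not in the original): the
                # persisted 'chroma_db' directory survives across runs, so each
                # "Analyze and Query" click adds another copy of the page's
                # chunks to the same store.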
                docs = db.similarity_search(query, k=3)
                context = " ".join([d.page_content for d in docs])
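                # Alternative retrieval sketch (assumes the LangChain Chroma
                # wrapper in use exposes max_marginal_relevance_search, as
                # current versions do) for reducing near-duplicate chunks:
                # docs = db.max_marginal_relevance_search(query, k=3)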
                # Prepare and call the chat model
                deepseek_endpoint = HuggingFaceEndpoint(
                    repo_id='deepseek-ai/DeepSeek-Prover-V2-671B',
                    provider='sambanova',
                    temperature=0.5,
                    max_new_tokens=50,  # note: 50 tokens keeps answers quite short
                    task='conversational',
                )
                # ChatHuggingFace only needs the configured endpoint; the original
                # repeated repo_id/provider/temperature/max_new_tokens/task here,
                # which is redundant, so only llm= is passed.
                deep_seek = ChatHuggingFace(llm=deepseek_endpoint)
                message = f"""Context:\n{context}\nQuestion:\n{query}"""
                response = deep_seek.invoke([{"role": "user", "content": message}])
                return response.content

            response = asyncio.run(main())

        st.success("Done.")
        st.write("**Response from Model:**")
        st.write(response)
# ------------------------------------------------------------------------------
# End of Streamlit App
# ------------------------------------------------------------------------------