"""Streamlit app: crawl a website, semantically search its content, and answer
a user query with a Hugging Face chat model grounded in the retrieved context."""

import asyncio
import os

import streamlit as st
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_core.documents.base import Document
from langchain_huggingface.chat_models import ChatHuggingFace
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_huggingface.llms import HuggingFaceEndpoint

# ------------------------------------------------------------------------------
# Set your API tokens
# ------------------------------------------------------------------------------
# Fix: os.getenv("key") returns None when the variable is unset, and assigning
# None into os.environ raises TypeError. Only export the tokens when present.
_hf_token = os.getenv("key")
if _hf_token:
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = _hf_token
    os.environ['HF_TOKEN'] = _hf_token

# ------------------------------------------------------------------------------
# Streamlit App
# ------------------------------------------------------------------------------
st.title("Web Crawler + Semantic Search + Conversational Model")

# Input for the website to crawl
url = st.text_input("Enter a website URL to crawl:")

# Input for semantic search
query = st.text_input("Enter your semantic search query:")

# Button to start the process
if st.button("Analyze and Query"):
    if not url or not query:
        st.error("Please provide both a URL and a semantic search query.")
    else:
        with st.spinner("Crawling website, retrieving documents, and generating a response..."):

            async def main() -> str:
                """Crawl ``url``, index the page, retrieve context for
                ``query``, and return the chat model's answer text."""
                # --- Crawling ---
                browser_config = BrowserConfig()
                run_config = CrawlerRunConfig()
                async with AsyncWebCrawler(config=browser_config) as crawler:
                    result = await crawler.arun(url=url, config=run_config)
                doc = Document(page_content=result.markdown.raw_markdown)

                # --- Split documents into chunks ---
                text_splitter = CharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=100,
                )
                chunks = text_splitter.split_documents([doc])

                # --- Embedding and vector store ---
                # Fix: the HuggingFaceEmbeddings constructor parameter is
                # `model_name`, not `model`.
                emb = HuggingFaceEmbeddings(
                    model_name='avsolatorio/GIST-small-Embedding-v0'
                )
                db = Chroma.from_documents(chunks, emb, persist_directory='chroma_db')
                docs = db.similarity_search(query, k=3)
                context = " ".join([d.page_content for d in docs])

                # --- Prepare and call the chat model ---
                deepseek_endpoint = HuggingFaceEndpoint(
                    repo_id='deepseek-ai/DeepSeek-Prover-V2-671B',
                    provider='sambanova',
                    temperature=0.5,
                    max_new_tokens=50,
                    task='conversational',
                )
                # Fix: ChatHuggingFace is a wrapper around the endpoint; the
                # duplicated endpoint kwargs (repo_id, provider, temperature,
                # max_new_tokens, task) belong on HuggingFaceEndpoint only.
                deep_seek = ChatHuggingFace(llm=deepseek_endpoint)

                message = f"""Context:\n{context}\nQuestion:\n{query}"""
                response = deep_seek.invoke([{"role": "user", "content": message}])
                return response.content

            response = asyncio.run(main())
            st.success("Done.")
            st.write("**Response from Model:**")
            st.write(response)

# ------------------------------------------------------------------------------
# End of Streamlit App
# ------------------------------------------------------------------------------