# ------------------------------------------------------------------------------
# Streamlit Space: web crawler + semantic search + conversational model demo
# ------------------------------------------------------------------------------
import streamlit as st
import asyncio
import os

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from langchain_core.documents.base import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain_huggingface.chat_models import ChatHuggingFace
from langchain_huggingface.llms import HuggingFaceEndpoint

# ------------------------------------------------------------------------------
# Set your API tokens
# ------------------------------------------------------------------------------
# os.environ values must be str: assigning os.getenv("key") directly raises
# TypeError when the "key" env var is unset. Guard and surface a clear warning.
_hf_token = os.getenv("key")
if _hf_token:
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = _hf_token
    os.environ['HF_TOKEN'] = _hf_token
else:
    st.warning('Environment variable "key" is not set; Hugging Face API calls may fail.')

# ------------------------------------------------------------------------------
# Streamlit App
# ------------------------------------------------------------------------------
st.title("Web Crawler + Semantic Search + Conversational Model")

# Input for the website to crawl
url = st.text_input("Enter a website URL to crawl:")
# Input for semantic search
query = st.text_input("Enter your semantic search query:")

# Button to start the process
if st.button("Analyze and Query"):
    if not url or not query:
        st.error("Please provide both a URL and a semantic search query.")
    else:
        with st.spinner("Crawling website, retrieving documents, and generating a response..."):

            async def main():
                """Crawl `url`, index the page, retrieve context for `query`,
                and return the chat model's answer as a string."""
                # --- Crawl the page and wrap its markdown in a LangChain Document ---
                browser_config = BrowserConfig()
                run_config = CrawlerRunConfig()
                async with AsyncWebCrawler(config=browser_config) as crawler:
                    result = await crawler.arun(url=url, config=run_config)
                doc = Document(page_content=result.markdown.raw_markdown)

                # --- Split the document into overlapping chunks for embedding ---
                text_splitter = CharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=100,
                )
                chunks = text_splitter.split_documents([doc])

                # --- Embedding and vector store; retrieve top-3 chunks ---
                # NOTE: HuggingFaceEmbeddings takes `model_name`, not `model`.
                emb = HuggingFaceEmbeddings(model_name='avsolatorio/GIST-small-Embedding-v0')
                db = Chroma.from_documents(chunks, emb, persist_directory='chroma_db')
                docs = db.similarity_search(query, k=3)
                context = " ".join([d.page_content for d in docs])

                # --- Chat model ---
                # Generation kwargs (repo_id, provider, temperature, ...) belong on
                # the endpoint; ChatHuggingFace is a thin wrapper that takes `llm=`.
                deepseek_endpoint = HuggingFaceEndpoint(
                    repo_id='deepseek-ai/DeepSeek-Prover-V2-671B',
                    provider='sambanova',
                    temperature=0.5,
                    max_new_tokens=50,
                    task='conversational',
                )
                deep_seek = ChatHuggingFace(llm=deepseek_endpoint)

                message = f"""Context:\n{context}\nQuestion:\n{query}"""
                response = deep_seek.invoke([{"role": "user", "content": message}])
                return response.content

            response = asyncio.run(main())

        st.success("Done.")
        st.write("**Response from Model:**")
        st.write(response)
# ------------------------------------------------------------------------------
# End of Streamlit App
# ------------------------------------------------------------------------------