import os
import subprocess
import asyncio
import re
from itertools import chain

import numpy as np
import pandas as pd
import requests
import faiss
import streamlit as st
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM
#from langchain_ollama import OllamaEmbeddings
from langchain_groq import ChatGroq

# Install the Chromium browser that AsyncChromiumLoader (Playwright) relies on.
subprocess.run(["playwright", "install"], check=True)
# Scraping and Embedding Function
async def process_urls(urls):
    # Load multiple URLs asynchronously
    loader = AsyncChromiumLoader(urls)
    docs = await loader.aload()
    # Transform HTML to text
    text_transformer = Html2TextTransformer()
    transformed_docs = text_transformer.transform_documents(docs)
    # Split the text into chunks and retain metadata
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
    split_docs_nested = [text_splitter.split_documents([doc]) for doc in transformed_docs]
    #split_docs = text_splitter.split_documents(transformed_docs)
    split_docs = list(chain.from_iterable(split_docs_nested))
    # Attach the source URL to each split document
    for doc in split_docs:
        doc.metadata["source_url"] = doc.metadata.get("source", "Unknown")  # Ensure URL metadata exists
    return split_docs
def clean_text(text):
    """Remove unnecessary whitespace, line breaks, and special characters."""
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse excessive whitespace
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text)  # Remove bracketed text (e.g., [advert])
    return text
def embed_text(text_list):
    """Embed a list of texts with the nomic embedding model and return a NumPy array."""
    model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
    embeddings = model.encode(text_list)
    if embeddings is None or len(embeddings) == 0:
        raise ValueError("Embedding function returned an empty result.")
    return embeddings
def store_embeddings(docs):
    """Convert text into embeddings and store them in FAISS."""
    # Build the text and source lists in one pass so their indices stay aligned.
    docs_with_text = [doc for doc in docs if getattr(doc, "page_content", None)]
    all_text = [clean_text(doc.page_content) for doc in docs_with_text]
    text_sources = [doc.metadata.get("source_url", "Unknown") for doc in docs_with_text]
    embeddings = embed_text(all_text)
    if embeddings is None or embeddings.size == 0:
        raise ValueError("Embedding function returned None or an empty array.")
    embeddings = np.array(embeddings, dtype=np.float32)
    # Normalize embeddings so inner product behaves like cosine similarity.
    faiss.normalize_L2(embeddings)
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)  # Inner product (cosine similarity on normalized vectors)
    index.add(embeddings)
    return index, all_text, text_sources
def search_faiss(index, query_embedding, text_data, text_sources, top_k=5, min_score=0.5):
    """Return the top_k chunks whose similarity to the query is at least min_score."""
    query_embedding = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query_embedding)  # Normalize the query embedding for cosine similarity
    distances, indices = index.search(query_embedding, top_k)
    results = []
    if indices.size > 0:
        for i in range(len(indices[0])):
            if distances[0][i] >= min_score:  # Ignore low-similarity results
                idx = indices[0][i]
                if idx < len(text_data):
                    results.append({"source": text_sources[idx], "content": text_data[idx]})
    return results
def query_llm(index, text_data, text_sources, query):
    """Retrieve relevant chunks from FAISS and ask the Groq model about each one."""
    # Read the Groq API key from the environment rather than hard-coding it.
    groq_api = os.environ.get("GROQ_API_KEY")
    chat = ChatGroq(model="llama-3.2-1b-preview", groq_api_key=groq_api, temperature=0)
    # Embed the query
    query_embedding = embed_text([query])[0]
    # Search FAISS for relevant documents
    relevant_docs = search_faiss(index, query_embedding, text_data, text_sources, top_k=3)
    # If no relevant docs, return a default message
    if not relevant_docs:
        return "No relevant information found."
    # Query the LLM with each retrieved chunk
    responses = []
    for doc in relevant_docs:
        if isinstance(doc, dict) and "source" in doc and "content" in doc:
            source_url = doc["source"]
            content = doc["content"][:10000]
        else:
            print(f"Unexpected doc format: {doc}")  # Debugging print
            continue
        prompt = f"""
        Based on the following content, answer the question: "{query}"
        Content (from {source_url}):
        {content}
        """
        response = chat.invoke(prompt)
        responses.append({"source": source_url, "response": response})
    return responses
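
# A minimal end-to-end sketch of the retrieval pipeline outside Streamlit
# (illustrative only; it assumes GROQ_API_KEY is set and the URLs are reachable):
#
#   docs = asyncio.run(process_urls(["https://en.wikipedia.org/wiki/Nigeria"]))
#   index, texts, sources = store_embeddings(docs)
#   answers = query_llm(index, texts, sources, "Where is Nigeria located?")
#   for a in answers:
#       print(a["source"], a["response"].content)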
# Streamlit UI
st.title("Web Scraper & AI Query Interface")
urls = st.text_area("Enter URLs (one per line)", "https://en.wikipedia.org/wiki/Nigeria\nhttps://en.wikipedia.org/wiki/Ghana")
query = st.text_input("Enter your question", "Where is Nigeria located?")

if st.button("Run Scraper"):
    st.write("Fetching and processing URLs...")

    async def run_scraper():
        url_list = [u.strip() for u in urls.split("\n") if u.strip()]
        split_docs = await process_urls(url_list)
        index, text_data, text_sources = store_embeddings(split_docs)
        return index, text_data, text_sources

    # Run the async pipeline and keep the results in session state,
    # since Streamlit re-executes the script on every interaction.
    index, text_data, text_sources = asyncio.run(run_scraper())
    st.session_state["faiss_index"] = index
    st.session_state["text_data"] = text_data
    st.session_state["text_sources"] = text_sources
    st.write("Data processed! Now you can ask questions about the scraped content.")

user_query = st.text_input("Ask a question about the scraped data")
if st.button("Query Model"):
    if "faiss_index" not in st.session_state:
        st.warning("Run the scraper first so there is indexed content to query.")
    else:
        result = query_llm(
            st.session_state["faiss_index"],
            st.session_state["text_data"],
            st.session_state["text_sources"],
            user_query,
        )
        if isinstance(result, str):
            st.write(result)
        else:
            for entry in result:
                st.subheader(f"Source: {entry['source']}")
                st.write(f"Response: {entry['response'].content}")