import asyncio
import os
import re
import numpy as np
import faiss
from itertools import chain
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain_ollama import OllamaEmbeddings
from langchain_groq import ChatGroq

async def process_urls(urls):
    """Load the given URLs, convert the HTML to text, and split it into chunks."""
    # Load multiple URLs asynchronously
    loader = AsyncChromiumLoader(urls)
    docs = await loader.aload()
    # Transform HTML to text
    text_transformer = Html2TextTransformer()
    transformed_docs = text_transformer.transform_documents(docs)
    # Split the text into chunks and retain metadata
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
    split_docs_nested = [text_splitter.split_documents([doc]) for doc in transformed_docs]
    split_docs = list(chain.from_iterable(split_docs_nested))
    # Attach the source URL to each split document
    for doc in split_docs:
        doc.metadata["source_url"] = doc.metadata.get("source", "Unknown")  # Ensure URL metadata exists
    return split_docs
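
# Note: AsyncChromiumLoader renders pages with a headless Chromium browser driven by
# Playwright, and Html2TextTransformer depends on the html2text package. If either is
# missing, the environment will likely need something along these lines:
#
#   pip install playwright html2text
#   playwright install chromium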

def clean_text(text):
    """Remove unnecessary whitespace, line breaks, and special characters."""
    text = re.sub(r'\s+', ' ', text).strip()  # Remove excessive whitespace
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text)  # Remove bracketed text (e.g., [advert])
    return text

def embed_text(text_list):
    """Embed a list of texts with the nomic-embed-text sentence-transformer model."""
    model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
    embeddings = model.encode(text_list)
    if embeddings is None or len(embeddings) == 0:
        raise ValueError("Embedding function returned an empty result.")
    return embeddings
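
# Optional alternative (not used by the functions below): embed_text() reloads the model
# weights on every call. A lazily cached loader, sketched here with the hypothetical
# helper get_embed_model(), avoids that cost when embedding repeatedly.
_cached_embed_model = None

def get_embed_model():
    """Load the SentenceTransformer once and reuse it on later calls (illustrative sketch)."""
    global _cached_embed_model
    if _cached_embed_model is None:
        _cached_embed_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
    return _cached_embed_model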

def store_embeddings(docs):
    """Convert text into embeddings and store them in FAISS."""
    docs_with_text = [doc for doc in docs if getattr(doc, "page_content", None)]
    all_text = [clean_text(doc.page_content) for doc in docs_with_text]
    text_sources = [doc.metadata.get("source_url", "Unknown") for doc in docs_with_text]
    embeddings = embed_text(all_text)
    if embeddings is None or embeddings.size == 0:
        raise ValueError("Embedding function returned None or an empty list.")
    embeddings = np.array(embeddings, dtype=np.float32)
    # Normalize embeddings so inner-product search behaves like cosine similarity
    faiss.normalize_L2(embeddings)
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)  # Inner product over unit vectors = cosine similarity
    index.add(embeddings)
    return index, all_text, text_sources

def search_faiss(index, query_embedding, text_data, text_sources, top_k=5, min_score=0.5):
    """Return the top_k indexed chunks most similar to the query embedding."""
    query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query_embedding)  # Normalize the query so scores are cosine similarities
    distances, indices = index.search(query_embedding, top_k)
    results = []
    if indices.size > 0:
        for i in range(len(indices[0])):
            if distances[0][i] >= min_score:  # Ignore irrelevant results
                idx = indices[0][i]
                if idx < len(text_data):
                    results.append({"source": text_sources[idx], "content": text_data[idx]})
    return results
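
# Illustrative usage of search_faiss (variable names here are examples, not part of the
# pipeline; query_llm() below performs these same steps internally):
#
#   index, text_data, text_sources = store_embeddings(split_docs)
#   q_vec = embed_text(["Where is Nigeria located"])[0]   # 1-D vector for a single query
#   hits = search_faiss(index, q_vec, text_data, text_sources, top_k=3)
#   for hit in hits:
#       print(hit["source"], hit["content"][:80])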

def query_llm(index, text_data, text_sources, query):
    """Retrieve the chunks most relevant to the query and ask the Groq-hosted LLM about each one."""
    groq_api = os.environ.get("GROQ_API_KEY")  # Read the API key from the environment rather than hardcoding it
    chat = ChatGroq(model="llama-3.2-1b-preview", groq_api_key=groq_api, temperature=0)
    # Embed the query
    query_embedding = embed_text([query])[0]
    # Search FAISS for relevant documents
    relevant_docs = search_faiss(index, query_embedding, text_data, text_sources, top_k=3)
    # If no relevant docs, return a default message
    if not relevant_docs:
        return "No relevant information found."
    # Query the LLM with each retrieved chunk
    responses = []
    for doc in relevant_docs:
        if isinstance(doc, dict) and "source" in doc and "content" in doc:
            source_url = doc["source"]
            content = doc["content"][:10000]
        else:
            print(f"Unexpected doc format: {doc}")  # Debugging print
            continue
        prompt = f"""
        Based on the following content, answer the question: "{query}"
        Content (from {source_url}):
        {content}
        """
        response = chat.invoke(prompt)
        responses.append({"source": source_url, "response": response.content})
    return responses

# Example usage: crawl two pages, index them, and answer a question about the content.
query = "Where is Nigeria located"

async def main():
    urls = ["https://en.wikipedia.org/wiki/Nigeria", "https://en.wikipedia.org/wiki/Ghana"]
    split_docs = await process_urls(urls)
    # Build the FAISS index once the documents are available
    index, text_data, text_sources = store_embeddings(split_docs)
    response = query_llm(index, text_data, text_sources, query)
    print(response)

if __name__ == "__main__":
    asyncio.run(main())