# CRAG (Corrective Retrieval-Augmented Generation) workflow, first-generation implementation:
# retrieval evaluation, knowledge refinement, and a web-search fallback.
# Install the necessary libraries:
# !pip install transformers sentence-transformers torch requests bs4
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import torch
# Step 1: Load Models for Summarization and Similarity
model_name = "facebook/bart-large-cnn" # Summarization model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Summarization pipeline
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
# Sentence similarity model
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# Step 2: Define Retrieval Evaluator
def evaluate_retrieval(query, retrieved_docs):
    """
    Evaluate the relevance of retrieved documents using cosine similarity
    with sentence embeddings.
    """
    query_embedding = similarity_model.encode(query, convert_to_tensor=True)
    doc_embeddings = similarity_model.encode(retrieved_docs, convert_to_tensor=True)
    # Calculate cosine similarity between the query and each document
    similarities = [util.pytorch_cos_sim(query_embedding, doc_embedding).item()
                    for doc_embedding in doc_embeddings]
    # Set a threshold for relevance (adjustable)
    relevance_threshold = 0.5
    relevance_scores = ['Correct' if sim > relevance_threshold else 'Incorrect'
                        for sim in similarities]
    return relevance_scores
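
# Illustrative only: with the default 0.5 threshold, a topical document should
# score 'Correct' and an off-topic one 'Incorrect'. Exact similarity values
# depend on the embedding model, so treat these outputs as indicative:
#   evaluate_retrieval("solar power", ["Solar panels convert sunlight.", "Cats purr."])
#   -> ['Correct', 'Incorrect']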
# Step 3: Knowledge Refinement (Decompose-then-Recompose)
def decompose_then_recompose(retrieved_docs):
    """
    Refine the retrieved documents by summarizing their key information.
    """
    refined_knowledge = []
    for doc in retrieved_docs:
        # truncation=True keeps inputs within BART's 1024-token limit,
        # so longer documents are clipped instead of raising an error
        summary = summarizer(doc, max_length=50, min_length=20,
                             do_sample=False, truncation=True)[0]['summary_text']
        refined_knowledge.append(summary)
    return refined_knowledge
# Step 4: Web Search for External Knowledge
def web_search(query):
    """
    Perform a web search to retrieve additional external knowledge if the
    retrieved documents are not relevant.

    Note: scraping Google results directly is brittle and may be blocked;
    a search API is more reliable in production.
    """
    search_url = f"https://www.google.com/search?q={query.replace(' ', '+')}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(search_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract URLs from search results (simplified)
    links = []
    for item in soup.find_all('a'):
        link = item.get('href')
        if link and "http" in link:
            links.append(link)
    return links[:5]  # Return the first 5 URLs
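
# web_search() returns bare URLs, which cannot be summarized directly; their
# page text has to be fetched first. fetch_page_text() below is an added
# helper sketch (not part of the original workflow) that pulls visible
# paragraph text with BeautifulSoup; real pages may need sturdier extraction.
def fetch_page_text(url, max_chars=2000):
    """Fetch a page and return its concatenated paragraph text (best effort)."""
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        text = " ".join(p.get_text(" ", strip=True) for p in soup.find_all('p'))
        return text[:max_chars]
    except requests.RequestException:
        return ""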
# Step 5: Generate Final Output
def generate_final_output(query, refined_knowledge):
    """
    Generate the final output summary using the refined knowledge.
    """
    combined_knowledge = " ".join(refined_knowledge)
    final_summary = summarizer(combined_knowledge, max_length=100, min_length=50,
                               do_sample=False, truncation=True)[0]['summary_text']
    return final_summary
# Step 6: CRAG Workflow Integration
def crag_workflow(query, retrieved_docs):
    """
    Full CRAG workflow integrating evaluation, knowledge refinement,
    and web search to generate a robust output summary.
    """
    # Step 1: Evaluate retrieval
    relevance_scores = evaluate_retrieval(query, retrieved_docs)
    if 'Correct' in relevance_scores:
        # Step 2: Decompose-then-recompose the relevant documents
        refined_knowledge = decompose_then_recompose(
            [doc for doc, score in zip(retrieved_docs, relevance_scores) if score == 'Correct'])
    else:
        # Step 3: Fall back to web search; fetch each page's text before
        # summarizing, since web_search() returns bare URLs
        web_results = web_search(query)
        page_texts = [text for text in (fetch_page_text(url) for url in web_results) if text]
        refined_knowledge = decompose_then_recompose(page_texts)
    # Step 4: Generate final output
    final_summary = generate_final_output(query, refined_knowledge)
    return final_summary
# Example Usage
if __name__ == "__main__":
    # Example query and retrieved documents
    query = "What are the latest advancements in renewable energy?"
    retrieved_docs = [
        "Renewable energy is becoming increasingly important in today's world...",
        "Solar energy has seen significant advancements in the past decade...",
        "Wind energy technology is rapidly evolving, with new innovations expected soon..."
    ]
    # Perform the CRAG workflow
    final_summary = crag_workflow(query, retrieved_docs)
    print("Final Summary:", final_summary)