# ContentWeaverAI — src/utils.py
# (Hugging Face Space snapshot; commit 2d7fa45, "fix: fix an error", by bisryy)
import os
os.environ['TRANSFORMERS_CACHE'] = '/tmp/.cache'
os.environ['HF_HOME'] = '/tmp/.cache'
os.environ['HF_DATASETS_CACHE'] = '/tmp/.cache'
os.environ['HF_METRICS_CACHE'] = '/tmp/.cache'
import feedparser
import requests
from bs4 import BeautifulSoup
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import time
import json
import uuid
from dotenv import load_dotenv
load_dotenv()
# --- GLOBAL SETUP ---
# Heavyweight resources are created once at import time so every call to
# run_newsletter_workflow() reuses them instead of reloading models per request.

# Load embedding model once (used for both document and query embeddings).
print('Loading embedding model...')
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print('Embedding model loaded.')

# Initialize Chroma client once.
# NOTE(review): this is the in-memory client — the index does not persist
# across process restarts; each workflow run rebuilds it anyway (see below).
client = chromadb.Client()
collection_name = "newsletter_articles"

# Load LLM once. Meta-Llama-3 is a gated model, so a valid HF token is required.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
print(f"Loading LLM: {model_id}")
from huggingface_hub import login
# NOTE(review): unusual env-var casing — confirm the .env key really is 'HF_Token'
# (HF tooling conventionally uses HF_TOKEN).
hf_token = os.getenv('HF_Token')
if hf_token:
    login(token=hf_token)
else:
    print("HF_Token not found in environment. Check your .env file.")

tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    # Ensure a pad token exists by reusing EOS; the generation calls below pass
    # pad_token_id explicitly and would otherwise receive None.
    print("Warning: pad_token is None. Setting pad_token to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",          # let accelerate place layers on available devices
    torch_dtype=torch.bfloat16  # halve memory vs fp32
)
print("LLM loaded.")

llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# --- MAIN FUNCTION ---
def run_newsletter_workflow(prferences_dict):
    """Run the end-to-end newsletter pipeline and return the result as markdown.

    Steps: fetch entries from fixed RSS feeds -> scrape each article's full
    text -> embed and index everything in a fresh Chroma collection -> retrieve
    the articles most relevant to the user's keywords -> summarize each and add
    a short commentary with the LLM -> format a markdown newsletter.

    Args:
        prferences_dict: dict with optional keys "keywords" (list of str) and
            "preferred_tone" (str, defaults to 'informative').
            NOTE(review): the parameter name is misspelled ("prferences"), but
            renaming it would break any caller passing it by keyword, so it is
            left as-is.

    Returns:
        Tuple (newsletter_markdown, status_message). newsletter_markdown is
        None when no keywords were provided.
    """
    user_preferences = {
        "id": str(uuid.uuid4()),
        "keywords": prferences_dict.get("keywords", []),
        "preferred_tone": prferences_dict.get("preferred_tone", 'informative'),
    }
    # Keywords drive the vector-DB query; without them there is nothing to do.
    if not user_preferences["keywords"]:
        return None, "No Keywords provided"

    # Fixed set of AI-related RSS feeds to pull candidate articles from.
    rss_feed_urls = [
        "http://feeds.feedburner.com/TechCrunch/artificial-intelligence",
        "https://news.mit.edu/topic/mitcobrand-artificial-intelligence2-rss.xml",
        "https://hackingbutlegal.com/feed/",
    ]

    def fetch_articles_from_feeds(feed_urls):
        # Parse each feed and collect its entries as dicts. A failing feed is
        # logged and skipped rather than aborting the whole run.
        articles = []
        for url in feed_urls:
            try:
                feed = feedparser.parse(url)
                for entry in feed.entries:
                    articles.append({
                        "id": str(uuid.uuid4()),
                        "title": entry.title,
                        "link": entry.link,
                        "published": entry.get("published", "N/A"),
                        "summary": entry.get("summary", ""),
                        # Prefer the feed's full "content" payload when present;
                        # fall back to the summary at each level of the lookup.
                        "content": entry.get("content", [{"value": entry.get("summary", "")}])[0].get("value", entry.get("summary", ""))
                    })
                print(f"Fetched {len(feed.entries)} entries from {url}")
                time.sleep(1)  # be polite between feed requests
            except Exception as e:
                print(f"Error fetching feed {url}: {e}")
        return articles

    fetched_articles = fetch_articles_from_feeds(rss_feed_urls)
    print(f"\nFetched a total of {len(fetched_articles)} articles.")

    def scrape_article_content(url):
        # Fetch the article page and extract readable text, capped at 5000
        # chars. Returns None on any network or parsing failure so the caller
        # can keep the feed-provided content instead.
        try:
            headers = { 'User-Agent': 'MyNewsletterBot/1.0 (+http://example.com/botinfo)'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Try common "main content" containers first.
            main_content = soup.find('article') or soup.find('main') or soup.find('div', class_ = 'content')
            if main_content:
                text = ' '.join(main_content.stripped_strings)
                return text[:5000]
            else:
                # Fallback: concatenate all paragraph text on the page.
                paragraphs = soup.find_all('p')
                text = ' '.join(p.get_text() for p in paragraphs)
                return text[:5000]
        except requests.exceptions.RequestException as e:
            print(f"Scraping error for {url}: {e}")
            return None
        except Exception as e:
            print(f'Scraping Parsing error for {url}: {e}')
            return None

    # Replace the (often truncated) feed content with the scraped full text
    # whenever scraping succeeds.
    for article in fetched_articles:
        print(f"Attempting to scrape: {article['link']}")
        full_content = scrape_article_content(article['link'])
        if full_content:
            article['content'] = full_content
        time.sleep(2)  # throttle requests across article sites

    # Setup Chroma collection (delete existing, create new) so each run starts
    # from a clean index.
    try:
        client.delete_collection(name=collection_name)
        print(f'Deleted existing collection: {collection_name}')
    except Exception:
        pass  # collection did not exist yet — nothing to delete
    collection = client.create_collection(name=collection_name)
    print(f"Created collection: {collection_name}")

    print("Adding articles to Vector DB...")
    ids_to_add = []
    embeddings_to_add = []
    documents_to_add = []
    metadata_to_add = []

    def clean_text(text):
        # Collapse all whitespace runs (newlines, tabs, repeats) to single spaces.
        return ' '.join(text.split())

    # Build parallel id/document/metadata/embedding lists for a single batched
    # collection.add() call below.
    for article in fetched_articles:
        cleaned_content = clean_text(article['content'])
        if not cleaned_content:
            continue  # skip articles with no usable text
        ids_to_add.append(article['id'])
        documents_to_add.append(cleaned_content)
        metadata_to_add.append({
            "title": article['title'],
            "link": article['link'],
            "published": article['published']
        })
        embedding = embedding_model.encode(cleaned_content, convert_to_tensor=True)
        embeddings_to_add.append(embedding.tolist())

    if ids_to_add:
        collection.add(
            ids=ids_to_add,
            embeddings=embeddings_to_add,
            documents=documents_to_add,
            metadatas=metadata_to_add
        )
        print(f"Added {len(ids_to_add)} articles to the collection.")
    else:
        print("No valid articles found to add to the collection.")

    def retrieve_relevent_articles(query_keywords, top_n=5):
        # Embed the joined keywords and run a nearest-neighbour query against
        # the collection. Returns a Chroma results dict, or [] when the
        # collection is empty — downstream code relies on [] being falsy.
        if collection.count() == 0:
            print("Collection is empty. Cannot retrieve.")
            return []
        query_text = " ".join(query_keywords)
        query_embedding = embedding_model.encode(query_text, convert_to_tensor=False).tolist()
        print(f"\nQuerying for articles related to: '{query_text}'")
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_n,
            include=['metadatas', 'documents']
        )
        print(f'Retrieved {len(results["ids"][0])} articles.')
        return results

    relevent_articles_data = retrieve_relevent_articles(user_preferences['keywords'], top_n=3)
    print("\nRelevent data sample:")
    print(json.dumps(relevent_articles_data, indent=2))

    def generate_summary(article_content, max_length=150):
        # Summarize one article with the LLM. max_length caps the new tokens
        # generated (plus a small margin below).
        max_input_length = 3000  # token budget for the article text itself
        # Truncate via encode/decode so the cut lands on token boundaries.
        truncated_content = tokenizer.decode(
            tokenizer.encode(article_content, max_length=max_input_length, truncation=True)
        )
        messages = [
            {"role": "system", "content": "You are a helpful assistant that summarizes articles concisely."},
            {"role": "user", "content": f"Please summarize the following article:\n\n{truncated_content}\n\nSummary:"}
        ]
        try:
            prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        except Exception:
            # Tokenizer has no chat template — fall back to a plain-text prompt.
            prompt = (
                f"System: You are a helpful assistant that summarizes articles concisely.\n"
                f"User: Please summarize the following article:\n\n{truncated_content}\n\nSummary:\nAssistant:"
            )
        print(f"\nGenerating summary...")
        sequences = llm_pipeline(
            prompt,
            max_new_tokens=max_length + 50,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
        try:
            # The pipeline returns prompt + completion; strip the prompt back off.
            # NOTE(review): the "Assistant:" marker only exists in the fallback
            # prompt above — with a real chat template the rfind() misses and the
            # replace() branch does the stripping instead.
            summary = sequences[0]['generated_text']
            assistant_marker = "Assistant:"
            summary_start_index = summary.rfind(assistant_marker)
            if summary_start_index != -1:
                summary = summary[summary_start_index + len(assistant_marker):].strip()
            else:
                summary = summary.replace(prompt, "").strip()
            print("Summary generated.")
            return summary
        except Exception as e:
            print(f"Error processing LLM output: {e}")
            return "Error generating summary."

    # Summarize each retrieved article, keyed by its Chroma document id.
    summaries = {}
    if relevent_articles_data and relevent_articles_data.get('ids'):
        for i, article_id in enumerate(relevent_articles_data['ids'][0]):
            content = relevent_articles_data['documents'][0][i]
            title = relevent_articles_data['metadatas'][0][i]['title']
            print(f"\nProcessing article: {title}")
            summaries[article_id] = generate_summary(content)
            time.sleep(1)  # pause between LLM calls
    else:
        print("No relevent articles retrieved to summarize.")

    def generate_commentary(summary, title, user_tone, max_length=75):
        # Produce a short (1-2 sentence) editorial comment on a summary, in the
        # user's preferred tone. Mirrors generate_summary's prompt/extraction flow.
        messages = [
            {"role": "system", "content": f"You are a content curator writing brief, engaging commentary for a newsletter. Adopt a {user_tone} tone."},
            {"role": "user", "content": f"Write a short comment (1-2 sentences) about the following article summary titled '{title}'. Relate it briefly to general interests in AI if possible, but focus on being engaging.\n\nSummary: {summary}\n\nCommentary:"}
        ]
        try:
            prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        except Exception:
            # Same plain-text fallback as generate_summary.
            prompt = (
                f"System: You are a content curator writing brief, engaging commentary for a newsletter. Adopt a {user_tone} tone.\n"
                f"User: Write a short comment (1-2 sentences) about the following article summary titled '{title}'. Relate it briefly to general interests in AI if possible, but focus on being engaging.\n\nSummary: {summary}\n\nCommentary:\nAssistant:"
            )
        print(f'Generating commentary for: {title}')
        sequences = llm_pipeline(
            prompt,
            max_new_tokens=max_length + 30,
            do_sample=True,
            temperature=0.8,  # slightly hotter than summaries for livelier copy
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
        try:
            # Same prompt-stripping caveat as in generate_summary.
            commentary = sequences[0]['generated_text']
            assistant_marker = "Assistant:"
            commentary_start_index = commentary.rfind(assistant_marker)
            if commentary_start_index != -1:
                commentary = commentary[commentary_start_index + len(assistant_marker):].strip()
            else:
                commentary = commentary.replace(prompt, "").strip()
            print('Commentary generated.')
            return commentary
        except Exception as e:
            print(f"Error processing LLM output for commentary: {e}")
            return "Error generating commentary"

    # Generate a commentary only for articles that were successfully summarized.
    commentaries = {}
    if relevent_articles_data and relevent_articles_data.get('ids'):
        for i, article_id in enumerate(relevent_articles_data['ids'][0]):
            if article_id in summaries:
                title = relevent_articles_data['metadatas'][0][i]['title']
                summary_text = summaries[article_id]
                commentaries[article_id] = generate_commentary(summary_text, title, user_preferences["preferred_tone"])
                time.sleep(1)  # pause between LLM calls

    def format_newsletter(retrieved_data, summaries_dict, commentaries_dict):
        # Assemble the final markdown: one section per retrieved article with
        # source link, publish date, summary, and optional commentary.
        newsletter = "# Your AI Agent & Workflow Digest 📰\n\n"
        newsletter += "Here are some articles curated based on your interests:\n\n"
        # retrieved_data may be [] (empty collection) or a dict with empty ids.
        if not retrieved_data or not retrieved_data.get('ids') or not retrieved_data['ids'][0]:
            newsletter += "No relevant articles found this time."
            return newsletter
        for i, article_id in enumerate(retrieved_data['ids'][0]):
            metadata = retrieved_data['metadatas'][0][i]
            summary = summaries_dict.get(article_id, "Summary not available.")
            commentary = commentaries_dict.get(article_id, "")
            newsletter += f"## {metadata['title']}\n\n"
            newsletter += f"**Source:** [{metadata['link']}]({metadata['link']})\n"
            newsletter += f"**Published:** {metadata['published']}\n\n"
            newsletter += f"**Summary:** {summary}\n\n"
            if commentary:
                newsletter += f"**Quick Take:** {commentary}\n\n"
            newsletter += "---\n\n"
        return newsletter

    final_newsletter = format_newsletter(relevent_articles_data, summaries, commentaries)
    print("\n\n--- GENERATED NEWSLETTER ---")
    print(final_newsletter)
    print("--- END OF NEWSLETTER ---")
    return final_newsletter, "Newsletter generation successful."