from smolagents import Tool
from typing import Any, Optional


class SimpleTool(Tool):
    name = "advanced_web_analyzer"
    description = "Advanced web content analyzer with ML-powered analysis capabilities."
    inputs = {
        "url": {
            "type": "string",
            "description": "The webpage URL to analyze.",
        },
        "analysis_mode": {
            "type": "string",
            "nullable": True,
            "description": "Analysis mode ('analyze', 'search', 'summarize', 'sentiment', 'topics').",
        },
        "query": {
            "type": "string",
            "nullable": True,
            "description": "Optional search term for 'search' mode.",
        },
        "language": {
            "type": "string",
            "nullable": True,
            "description": "Content language (default: 'en').",
        },
    }
    output_type = "string"
    def forward(self, url: str,
                analysis_mode: str = "analyze",
                query: Optional[str] = None,
                language: str = "en") -> str:
        """Advanced web content analyzer with ML-powered analysis capabilities.

        Args:
            url: The webpage URL to analyze.
            analysis_mode: Analysis mode ('analyze', 'search', 'summarize', 'sentiment', 'topics').
            query: Optional search term for 'search' mode.
            language: Content language (default: 'en').

        Returns:
            str: Advanced analysis of web content.
        """
        import requests
        from bs4 import BeautifulSoup
        from urllib.parse import urlparse
        import re
        from collections import Counter
        from transformers import pipeline
        try:
            # Validate URL
            parsed_url = urlparse(url)
            if not all([parsed_url.scheme, parsed_url.netloc]):
                return "Error: Invalid URL format. Please provide a valid URL."

            # Fetch webpage
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            # Parse content
            soup = BeautifulSoup(response.text, 'html.parser')
            for tag in soup(['script', 'style', 'meta']):
                tag.decompose()

            # Extract basic elements (soup.title.string can be None even when a <title> tag exists)
            title = soup.title.string if soup.title and soup.title.string else "No title found"
            title = re.sub(r'\s+', ' ', title).strip()
            text_content = soup.get_text()
            text_content = re.sub(r'\s+', ' ', text_content).strip()
            # Process based on mode
            if analysis_mode == "analyze":
                # Initialize ML pipelines
                summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
                classifier = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")

                # Get word statistics
                words = text_content.lower().split()
                word_count = len(words)
                word_freq = Counter(words).most_common(5)
                common_words = ""
                for word, count in word_freq:
                    common_words += f"- {word}: {count} times\n"

                # Get summary
                summary = summarizer(text_content[:1024], max_length=100, min_length=30)[0]['summary_text']

                # Get sentiment
                sentiment = classifier(text_content[:512])[0]
                sentiment_score = int(sentiment['label'][0])  # Convert '5 stars' to a number
                sentiment_desc = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"][sentiment_score - 1]

                # Format comprehensive analysis
                return f"""Comprehensive Content Analysis

Basic Information:
Title: {title}
Word Count: {word_count}
Reading Time: {word_count // 200} minutes

Quick Summary:
{summary}

Content Sentiment:
{sentiment_desc} ({sentiment_score}/5 stars)

Most Common Words:
{common_words}"""
elif analysis_mode == "summarize":
# Use BART for better summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Split into chunks if text is too long
chunks = [text_content[i:i+1024] for i in range(0, len(text_content), 1024)]
summaries = []
for chunk in chunks[:3]: # Process up to 3 chunks
if len(chunk.strip()) > 100:
summary = summarizer(chunk, max_length=100, min_length=30)[0]['summary_text']
summaries.append(summary)
return f"""π Content Summary for: {title}
{' '.join(summaries)}"""
elif analysis_mode == "sentiment":
# Use multilingual sentiment analyzer
classifier = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")
# Analyze main content and paragraphs
main_sentiment = classifier(text_content[:512])[0]
# Analyze individual paragraphs
paragraphs = soup.find_all('p')
detailed_sentiments = ""
para_count = 0
for p in paragraphs:
text = p.text.strip()
if len(text) > 50: # Only analyze meaningful paragraphs
sentiment = classifier(text[:512])[0]
score = int(sentiment['label'][0])
mood = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"][score-1]
detailed_sentiments += f"\nParagraph {para_count + 1}: {mood} ({score}/5)"
para_count += 1
if para_count >= 5: # Limit to 5 paragraphs
break
return f"""π Sentiment Analysis
Overall Sentiment: {["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"][int(main_sentiment['label'][0])-1]}
Overall Score: {main_sentiment['label'][0]}/5
Detailed Analysis:{detailed_sentiments}"""
elif analysis_mode == "topics":
# Use Zero-shot classification for topic detection
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Define potential topics
topics = [
"Technology", "Business", "Politics", "Science",
"Health", "Entertainment", "Sports", "Education",
"Environment", "Culture"
]
# Analyze main content
topic_results = classifier(text_content[:512], topics)
# Format results
topic_analysis = "Main Topics:\n"
for topic, score in zip(topic_results['labels'], topic_results['scores']):
if score > 0.1: # Only show relevant topics
topic_analysis += f"- {topic}: {score*100:.1f}% confidence\n"
# Get key phrases
from keybert import KeyBERT
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(text_content[:5000], keyphrase_ngram_range=(1, 2), stop_words='english', top_n=5)
key_phrases = "\nKey Phrases:\n"
for phrase, score in keywords:
key_phrases += f"- {phrase}: {score:.2f} relevance\n"
return f"""π― Topic Analysis
{topic_analysis}
{key_phrases}"""
elif analysis_mode == "search":
if not query:
return "Error: Search query is required for search mode."
# Use transformers for better search
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
# Search in paragraphs
paragraphs = soup.find_all('p')
search_results = ""
result_count = 0
for p in paragraphs:
text = p.text.strip()
if len(text) > 50 and query.lower() in text.lower():
# Get AI-enhanced answer
qa_result = qa_pipeline(question=query, context=text)
search_results += f"\n{result_count + 1}. Found in context: {qa_result['answer']}\n"
search_results += f" Confidence: {qa_result['score']:.2%}\n"
search_results += f" Full context: {text}\n"
result_count += 1
if result_count >= 3:
break
if not search_results:
return f"No matches found for '{query}'"
return f"""π AI-Enhanced Search Results for '{query}':
{search_results}"""
            else:
                return f"Error: Unknown mode '{analysis_mode}'"

        except Exception as e:
            return f"Error processing webpage: {str(e)}"