from smolagents import Tool
from typing import Optional

class SimpleTool(Tool):
    name = "advanced_web_analyzer"
    description = "Advanced web content analyzer with ML-powered analysis capabilities."
    inputs = {"url":{"type":"string","description":"The webpage URL to analyze."},"analysis_mode":{"type":"string","nullable":True,"description":"Analysis mode ('analyze', 'search', 'summarize', 'sentiment', 'topics')."},"query":{"type":"string","nullable":True,"description":"Optional search term for 'search' mode."},"language":{"type":"string","nullable":True,"description":"Content language (default: 'en')."}}
    output_type = "string"

    def forward(
        self,
        url: str,
        analysis_mode: str = "analyze",
        query: Optional[str] = None,
        language: str = "en",
    ) -> str:
        """Advanced web content analyzer with ML-powered analysis capabilities.

        Args:
            url: The webpage URL to analyze.
            analysis_mode: Analysis mode ('analyze', 'search', 'summarize', 'sentiment', 'topics').
            query: Optional search term for 'search' mode.
            language: Content language (default: 'en'). Currently accepted but not used by the analysis logic.

        Returns:
            str: Advanced analysis of web content.
        """
        # Imports are kept inside forward() so the tool definition stays
        # self-contained when exported by smolagents.
        import requests
        from bs4 import BeautifulSoup
        from urllib.parse import urlparse
        import re
        from collections import Counter
        from transformers import pipeline

        try:
            # Validate URL
            parsed_url = urlparse(url)
            if not all([parsed_url.scheme, parsed_url.netloc]):
                return "Error: Invalid URL format. Please provide a valid URL."

            # Fetch webpage
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # Parse content
            soup = BeautifulSoup(response.text, 'html.parser')
            for tag in soup(['script', 'style', 'meta']):
                tag.decompose()

            # Extract and normalize the title (soup.title.string can be None)
            title = soup.title.string if soup.title and soup.title.string else "No title found"
            title = re.sub(r'\s+', ' ', title).strip()

            text_content = soup.get_text()
            text_content = re.sub(r'\s+', ' ', text_content).strip()

            # Process based on mode
            if analysis_mode == "analyze":
                # Initialize ML pipelines (loaded on every call; caching them
                # at module level would avoid repeated model loads)
                summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
                classifier = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")

                # Get word statistics
                words = text_content.lower().split()
                word_count = len(words)
                word_freq = Counter(words).most_common(5)
                common_words = ""
                for word, count in word_freq:
                    common_words += f"- {word}: {count} times\n"

                # Get summary
                summary = summarizer(text_content[:1024], max_length=100, min_length=30)[0]['summary_text']

                # Get sentiment
                sentiment = classifier(text_content[:512])[0]
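                # This model returns star-rating labels, e.g. {'label': '4 stars', 'score': 0.62}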
                sentiment_score = int(sentiment['label'][0]) # Convert '5 stars' to number
                sentiment_desc = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"][sentiment_score-1]

                # Format comprehensive analysis
                return f"""πŸ” Comprehensive Content Analysis

    πŸ“‘ Basic Information:
    Title: {title}
    Word Count: {word_count}
    Reading Time: {word_count // 200} minutes

    πŸ“ Quick Summary:
    {summary}

    😊 Content Sentiment:
    {sentiment_desc} ({sentiment_score}/5 stars)

    πŸ“Š Most Common Words:
    {common_words}"""

            elif analysis_mode == "summarize":
                # Use BART for better summarization
                summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

                # Split into chunks if text is too long
                chunks = [text_content[i:i+1024] for i in range(0, len(text_content), 1024)]
                summaries = []

                for chunk in chunks[:3]:  # Process up to 3 chunks
                    if len(chunk.strip()) > 100:
                        summary = summarizer(chunk, max_length=100, min_length=30)[0]['summary_text']
                        summaries.append(summary)

                return f"""πŸ“ Content Summary for: {title}

    {' '.join(summaries)}"""

            elif analysis_mode == "sentiment":
                # Use multilingual sentiment analyzer
                classifier = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")

                # Analyze main content and paragraphs
                main_sentiment = classifier(text_content[:512])[0]

                # Analyze individual paragraphs
                paragraphs = soup.find_all('p')
                detailed_sentiments = ""
                para_count = 0

                for p in paragraphs:
                    text = p.text.strip()
                    if len(text) > 50:  # Only analyze meaningful paragraphs
                        sentiment = classifier(text[:512])[0]
                        score = int(sentiment['label'][0])
                        mood = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"][score-1]
                        detailed_sentiments += f"\nParagraph {para_count + 1}: {mood} ({score}/5)"
                        para_count += 1
                        if para_count >= 5:  # Limit to 5 paragraphs
                            break

                return f"""😊 Sentiment Analysis

    Overall Sentiment: {["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"][int(main_sentiment['label'][0])-1]}
    Overall Score: {main_sentiment['label'][0]}/5

    Detailed Analysis:{detailed_sentiments}"""

            elif analysis_mode == "topics":
                # Use Zero-shot classification for topic detection
                classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

                # Define potential topics
                topics = [
                    "Technology", "Business", "Politics", "Science",
                    "Health", "Entertainment", "Sports", "Education",
                    "Environment", "Culture"
                ]

                # Analyze main content
                topic_results = classifier(text_content[:512], topics)

                # Format results
                topic_analysis = "Main Topics:\n"
                for topic, score in zip(topic_results['labels'], topic_results['scores']):
                    if score > 0.1:  # Only show relevant topics
                        topic_analysis += f"- {topic}: {score*100:.1f}% confidence\n"

                # Get key phrases (KeyBERT is imported lazily; it is only needed here)
                from keybert import KeyBERT
                kw_model = KeyBERT()
                keywords = kw_model.extract_keywords(text_content[:5000], keyphrase_ngram_range=(1, 2), stop_words='english', top_n=5)

                key_phrases = "\nKey Phrases:\n"
                for phrase, score in keywords:
                    key_phrases += f"- {phrase}: {score:.2f} relevance\n"

                return f"""🎯 Topic Analysis

    {topic_analysis}
    {key_phrases}"""

            elif analysis_mode == "search":
                if not query:
                    return "Error: Search query is required for search mode."

                # Use transformers for better search
                qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

                # Search in paragraphs
                paragraphs = soup.find_all('p')
                search_results = ""
                result_count = 0

                for p in paragraphs:
                    text = p.text.strip()
                    if len(text) > 50 and query.lower() in text.lower():
                        # Get AI-enhanced answer
                        qa_result = qa_pipeline(question=query, context=text)
                        search_results += f"\n{result_count + 1}. Found in context: {qa_result['answer']}\n"
                        search_results += f"   Confidence: {qa_result['score']:.2%}\n"
                        search_results += f"   Full context: {text}\n"
                        result_count += 1
                        if result_count >= 3:
                            break

                if not search_results:
                    return f"No matches found for '{query}'"

                return f"""πŸ” AI-Enhanced Search Results for '{query}':
    {search_results}"""

            else:
                return f"Error: Unknown mode '{analysis_mode}'. Valid modes: 'analyze', 'search', 'summarize', 'sentiment', 'topics'."

        except Exception as e:
            return f"Error processing webpage: {str(e)}"