# %%
import requests
from bs4 import BeautifulSoup
import gradio as gr

def parse_news_item(html: str) -> dict:
    """
    Parse HTML of a news item to extract link, time, headline, and text.
    
    Args:
        html: The HTML string of a news item.
        
    Returns:
        A dictionary containing link, time, headline, and text.
        
    Raises:
        Exception: For parsing errors or other unexpected errors.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")

        # Get the anchor tag containing the link
        link_tag = soup.find("a", href=True)
        link = link_tag["href"] if link_tag else None

        # Get the headline inside <h3>
        headline_tag = soup.find("h3", class_="story__headline")
        headline = headline_tag.get_text(strip=True) if headline_tag else None

        # Get the text inside <p>
        text_tag = soup.find("p", class_="story__text")
        text = text_tag.get_text(strip=True) if text_tag else None

        # Get the time inside <time>
        time_tag = soup.find("time")
        time = time_tag.get_text(strip=True) if time_tag else None

        return {
            "link": link,
            "time": time,
            "headline": headline,
            "text": text,
        }
    except Exception as e:
        print(f"Error parsing news item: {e}")
        raise
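
# %%
# Minimal sanity check for parse_news_item. The sample HTML below is
# illustrative (not real udn.com markup) and only uses the tags and classes
# the parser targets: <a href>, <time>, h3.story__headline, p.story__text.
_sample_item = """
<li>
  <a href="https://example.invalid/story/1">
    <time>2025-01-01 10:00</time>
    <h3 class="story__headline">Sample headline</h3>
    <p class="story__text">Sample summary text.</p>
  </a>
</li>
"""
# print(parse_news_item(_sample_item))  # -> {'link': ..., 'time': ..., 'headline': ..., 'text': ...}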


# %%
def search_news(keyword: str, page: int = 1) -> list:
    """
    Fetch news articles related to a keyword from udn.com.
    
    Args:
        keyword: The search keyword for news articles.
        page: The page number to fetch (default is 1).
    
    Returns:
        A list of rows, one per article, each of the form [link, time, headline, text].
        
    Raises:
        requests.RequestException: If there's an error fetching data from the URL.
        Exception: For other unexpected errors.
    """
    try:
        url = f"https://money.udn.com/search/result/1001/{keyword}/{page}"
        response = requests.get(url, timeout=10)  # timeout so a slow server can't hang the app
        
        if response.status_code != 200:
            raise requests.RequestException(f"Failed to retrieve data: {response.status_code}")
        
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.select('div > div > main > section > ul > li')
        
        results = []
        for article in articles:
            try:
                article_html = article.prettify()
                data = parse_news_item(article_html)
                # Flatten the dict into a row [link, time, headline, text] for the DataFrame
                data_list = list(data.values())
                results.append(data_list)
            except Exception as e:
                print(f"Error parsing article: {e}")
                continue
        
        return results
    except requests.RequestException as e:
        print(f"Network error in search_news: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error in search_news: {e}")
        raise

# search_news('台積電', 1)  # Example usage to fetch news articles related to '台積電'
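
# A small sketch (illustrative) of collecting rows across several result pages:
# rows = []
# for p in range(1, 4):
#     rows.extend(search_news('台積電', p))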

# %%
# Fetch a URL and parse the article content
def get_content(url: str) -> dict:
    """
    Fetch and parse the content of a given URL.
    
    Args:
        url: The URL to fetch and parse.

    Returns:
        A dictionary containing the link, title, and text content of the page.
        
    Raises:
        requests.RequestException: If there's an error fetching data from the URL.
        Exception: For other unexpected errors.
    """
    try:
        response = requests.get(url, timeout=10)  # timeout so a slow server can't hang the app

        if response.status_code != 200:
            raise requests.RequestException(f"Failed to retrieve {url}: {response.status_code}")

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the article text from the element with id="article_body"
        # (assumes udn.com article pages keep their body in that container)
        article_body = soup.select_one('#article_body')
        text_content = ''
        if article_body:
            text_content = article_body.get_text(separator='\n', strip=True)

        return {
            'link': url,
            'title': soup.title.string if soup.title else 'No title',
            'text': text_content
        }
    except requests.RequestException as e:
        print(f"Network error in get_content: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error in get_content: {e}")
        raise

# get_content('https://money.udn.com/money/story/5612/8832289?from=edn_search_result')  # Example usage to fetch content from a specific URL

# %%
from smolagents import CodeAgent, LiteLLMModel, ToolCollection, ActionStep, FinalAnswerStep
import os
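
# The model id can be overridden via the AI_MODEL environment variable, and
# OPENROUTER_API_KEY must be set before this cell runs (export OPENROUTER_API_KEY=...).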

model_name = os.environ.get("AI_MODEL", "openrouter/qwen/qwen-2.5-coder-32b-instruct:free")
model = LiteLLMModel(model_name, api_key=os.environ["OPENROUTER_API_KEY"])
url = "https://robin0307-newsmcp.hf.space/gradio_api/mcp/sse"
server_parameters = {"url": url, "transport": "sse"}

def newsAgent(task: str) -> str:
    """
    News Agent to handle the news task.
    
    Args:
        task: The task description.

    Returns:
        The agent's output formatted as Markdown: one section per action step, plus the final answer.
        
    Raises:
        Exception: For errors during agent execution.
    """
    try:
        result = ""
        with ToolCollection.from_mcp(server_parameters, trust_remote_code=True) as mcp_tools:
            # Use only the first two MCP tools (presumably search and content fetch; skip the agent tool itself)
            agent = CodeAgent(tools=[*mcp_tools.tools[:2]], model=model)
            for event in agent.run(task, stream=True, max_steps=5):
                if isinstance(event, ActionStep):
                    result += f"\n## ======Step {event.step_number}======\n### Action\n```python\n{event.code_action}\n```\n### Observation\n{event.observations}"
                    # yield result
                if isinstance(event, FinalAnswerStep):
                    result += f"\n## ======Final======\n{event.output}"
                    # yield result
        return result
    except Exception as e:
        error_msg = f"Error in newsAgent: {e}"
        print(error_msg)
        raise Exception(error_msg) from e
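
# newsAgent('華碩今日新聞')  # Example usage: runs the agent end to end against the remote MCP tools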


# %%
# Use Gradio to create three tabs:
# 1. search news
# 2. get content from url
# 3. news agent
def main():
    with gr.Blocks() as demo:
        gr.Markdown("# News Search and Content Fetcher")
        
        with gr.Tab("Search News"):
            keyword = gr.Textbox(label="Keyword", placeholder="Enter keyword to search news")
            page = gr.Number(label="Page Number", value=1, step=1, precision=0)  # precision=0 keeps the page an int for the URL
            search_button = gr.Button("Search")
            search_results = gr.DataFrame(label="Search Results", headers=["Link", "Time", "Headline", "Text"])
            # Examples for Search News tab
            gr.Examples(
                examples=[
                    ["AI", 1],
                    ["華碩", 2]
                ],
                inputs=[keyword, page],
                outputs=search_results,
                fn=search_news,
                cache_examples=False
            )
            search_button.click(search_news, inputs=[keyword, page], outputs=search_results)
            
        
        with gr.Tab("Get Content from URL"):
            url_input = gr.Textbox(label="URL", placeholder="Enter URL to fetch content")
            content_output = gr.JSON(label="Content Output")
            # Examples for the Get Content from URL tab
            gr.Examples(
                examples=[
                    ["https://money.udn.com/money/story/5722/8870335?from=edn_search_result"],
                    ["https://money.udn.com/money/story/5612/8868152?from=edn_search_result"]
                ],
                inputs=[url_input],
                outputs=content_output,
                fn=get_content,
                cache_examples=False
            )
            url_input.submit(get_content, inputs=url_input, outputs=content_output)

        with gr.Tab("News Agent"):
            agent_input = gr.Textbox(label="Task", placeholder="Enter the task")
            # run_button = gr.Button("Run")
            result_output = gr.Markdown(label="Result")
            # Examples for the News Agent tab
            gr.Examples(
                examples=[
                    ["華碩今日新聞"],
                    ["華碩和Nvidia今日新聞"]
                ],
                inputs=[agent_input],
                outputs=result_output,
                fn=newsAgent,
                cache_examples=True
            )
            agent_input.submit(newsAgent, inputs=agent_input, outputs=result_output)

    demo.launch(mcp_server=True, server_name="0.0.0.0", allowed_paths=["/"], share=True)

if __name__ == "__main__":
    main()
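
# %%
# Note: with mcp_server=True, this app itself exposes its tab functions as MCP
# tools at <host>/gradio_api/mcp/sse (the same endpoint shape as `url` above).
# An illustrative local target for the agent cell would be:
# local_params = {"url": "http://127.0.0.1:7860/gradio_api/mcp/sse", "transport": "sse"}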