File size: 6,598 Bytes
10e9b7d
2707bf9
fffffb0
2794b4c
 
904e0bd
2707bf9
 
 
 
fffffb0
1803c5e
 
 
2707bf9
1803c5e
2707bf9
fffffb0
 
2707bf9
fffffb0
 
2707bf9
 
 
fffffb0
2707bf9
fffffb0
 
2794b4c
fffffb0
2707bf9
fffffb0
393a8af
fffffb0
2707bf9
fffffb0
393a8af
fffffb0
 
2707bf9
fffffb0
2707bf9
1803c5e
 
2707bf9
fffffb0
2707bf9
fffffb0
2707bf9
fffffb0
3628aaf
1803c5e
 
393a8af
1803c5e
2707bf9
fffffb0
1803c5e
393a8af
fffffb0
1803c5e
 
2707bf9
 
 
 
 
 
 
 
 
 
1803c5e
 
2707bf9
393a8af
177be6f
2707bf9
 
 
 
177be6f
d8dafef
2707bf9
 
 
d8dafef
 
177be6f
 
 
 
2707bf9
 
 
177be6f
2707bf9
177be6f
 
 
2707bf9
177be6f
d8dafef
177be6f
393a8af
177be6f
 
2707bf9
 
 
 
177be6f
 
2707bf9
393a8af
2707bf9
 
 
 
 
177be6f
2707bf9
 
 
 
 
d8dafef
393a8af
2707bf9
177be6f
393a8af
2707bf9
 
393a8af
177be6f
 
 
2707bf9
 
 
 
393a8af
 
3628aaf
2707bf9
 
 
177be6f
2707bf9
d8dafef
2707bf9
177be6f
2707bf9
177be6f
2707bf9
 
 
 
 
 
393a8af
 
177be6f
2707bf9
 
3628aaf
393a8af
3628aaf
393a8af
2707bf9
3628aaf
177be6f
 
2707bf9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import ast
import datetime
import math
import operator
import os
import re
from urllib.parse import quote

import gradio as gr
import pandas as pd
import requests
from openai import OpenAI

# Constants
# Base URL of the scoring service; the "/questions" and "/submit" endpoint
# paths are appended to it when fetching tasks and posting answers.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# ToolEnhancedAgent uses the current OpenAI Python client API (1.x)
class ToolEnhancedAgent:
    """Answer questions with an OpenAI chat model plus a few local tools.

    The model is asked to reason step by step and to name a tool in square
    brackets (``[calculator]``, ``[date]`` or ``[wikipedia]``) when it wants
    one. The first such tag found in the reply is executed with the rest of
    that line as input, and the tool result is appended to the answer.
    """

    # Matches the first *known* tool tag and captures the remainder of that
    # line as the tool input. Other bracketed words (e.g. "[note]") are
    # ignored so they can neither shadow a real tag nor append an
    # "Unknown tool" error to the answer.
    _TOOL_TAG = re.compile(r"\[(calculator|date|wikipedia)\](.*)", re.IGNORECASE)

    # Seconds to wait for the Wikipedia API before giving up.
    _HTTP_TIMEOUT = 15

    # Operators the calculator tool is allowed to apply.
    _BINARY_OPS = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    _UNARY_OPS = {
        ast.UAdd: operator.pos,
        ast.USub: operator.neg,
    }

    def __init__(self):
        """Create the OpenAI client from the ``OPENAI_API_KEY`` environment variable.

        Raises:
            ValueError: If ``OPENAI_API_KEY`` is unset or empty.
        """
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY not found in environment variables.")
        self.client = OpenAI(api_key=api_key)
        print("ToolEnhancedAgent initialized with OpenAI GPT model.")

    @staticmethod
    def _math_member(node):
        """Resolve a ``math.<name>`` AST node to the public ``math`` attribute.

        Raises:
            ValueError: If the node is anything other than a public attribute
                looked up directly on the name ``math``.
        """
        if (
            isinstance(node, ast.Attribute)
            and isinstance(node.value, ast.Name)
            and node.value.id == "math"
            and not node.attr.startswith("_")
            and hasattr(math, node.attr)
        ):
            return getattr(math, node.attr)
        raise ValueError("only math.<name> lookups are allowed")

    @classmethod
    def _evaluate(cls, node):
        """Evaluate a parsed arithmetic expression using a strict whitelist.

        Supports int/float literals, the operators in ``_BINARY_OPS`` and
        ``_UNARY_OPS``, ``math.<constant>`` and ``math.<function>(...)`` calls
        with positional arguments. Anything else raises ``ValueError``.
        """
        if isinstance(node, ast.Expression):
            return cls._evaluate(node.body)
        if isinstance(node, ast.Constant):
            value = node.value
            # bool is a subclass of int; reject it along with str/bytes/None.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"unsupported literal: {value!r}")
            return value
        if isinstance(node, ast.BinOp):
            apply_binary = cls._BINARY_OPS.get(type(node.op))
            if apply_binary is None:
                raise ValueError("unsupported binary operator")
            return apply_binary(cls._evaluate(node.left), cls._evaluate(node.right))
        if isinstance(node, ast.UnaryOp):
            apply_unary = cls._UNARY_OPS.get(type(node.op))
            if apply_unary is None:
                raise ValueError("unsupported unary operator")
            return apply_unary(cls._evaluate(node.operand))
        if isinstance(node, ast.Call):
            if node.keywords:
                raise ValueError("keyword arguments are not supported")
            func = cls._math_member(node.func)
            if not callable(func):
                raise ValueError("math constant is not callable")
            return func(*(cls._evaluate(arg) for arg in node.args))
        if isinstance(node, ast.Attribute):
            member = cls._math_member(node)
            if callable(member):
                raise ValueError("math function must be called")
            return member
        raise ValueError(f"unsupported expression: {type(node).__name__}")

    def use_tool(self, tool_name: str, input_text: str) -> str:
        """Run one of the built-in tools and return its result as text.

        Args:
            tool_name: ``"calculator"``, ``"date"`` or ``"wikipedia"``.
            input_text: Expression for the calculator, page title for
                Wikipedia; ignored by the date tool.

        Returns:
            The tool output, ``"[Tool Error: Unknown tool]"`` for an
            unrecognised name, or ``"[Tool Error: <reason>]"`` if the tool
            raised.
        """
        try:
            if tool_name == "calculator":
                # The input is model-generated text, so it is parsed and
                # evaluated against a whitelist instead of being passed to
                # eval(); emptying __builtins__ does not make eval() safe.
                tree = ast.parse(input_text.strip(), mode="eval")
                return str(self._evaluate(tree))
            if tool_name == "date":
                return str(datetime.date.today())
            if tool_name == "wikipedia":
                return self.search_wikipedia(input_text)
            return "[Tool Error: Unknown tool]"
        except Exception as e:
            # Tool failures are reported in-band so the answer is still returned.
            return f"[Tool Error: {e}]"

    def search_wikipedia(self, query: str) -> str:
        """Return the English Wikipedia summary extract for ``query``.

        Args:
            query: Page title to look up.

        Returns:
            The ``extract`` field of the summary response, a "No Wikipedia
            summary" message when the title is blank or the request does not
            return HTTP 200, or ``"Wikipedia Error: <reason>"`` on failure.
        """
        title = query.strip()
        if not title:
            return f"No Wikipedia summary for {query}."
        # Percent-encode the title so characters such as "/", "?" or "#"
        # cannot change the request path or be dropped from it.
        url = (
            "https://en.wikipedia.org/api/rest_v1/page/summary/"
            + quote(title.replace(" ", "_"), safe="")
        )
        try:
            # A timeout keeps one slow lookup from stalling the whole run.
            res = requests.get(url, timeout=self._HTTP_TIMEOUT)
            if res.status_code == 200:
                return res.json().get("extract", "No summary found.")
            return f"No Wikipedia summary for {query}."
        except Exception as e:
            return f"Wikipedia Error: {e}"

    def __call__(self, question: str) -> str:
        """Ask the model to answer ``question`` and run at most one tool.

        Args:
            question: The question text to answer.

        Returns:
            The model's reply, followed by a "[Tool used: ...]" section when
            the reply contains a known tool tag. On any failure the string
            ``"[Agent Error: <reason>]"`` is returned instead of raising.
        """
        # Chain-of-thought prompt with instructions for requesting a tool.
        prompt = (
            "You are an AI assistant that can think step-by-step and use tools when needed.\n"
            f"Question: {question}\n"
            "Answer with your reasoning steps. If needed, mention the tool you want to use like [calculator], [date], [wikipedia]."
        )

        try:
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant using tools and reasoning."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,
                max_tokens=700,
            )
            # The message content can be None (e.g. an empty completion).
            answer = (response.choices[0].message.content or "").strip()
            # Simple tool simulation: "[calculator] 2+2" in the reply runs the
            # calculator on "2+2" and appends the result to the answer.
            match = self._TOOL_TAG.search(answer)
            if match:
                tool_name = match.group(1).lower()
                tool_input = match.group(2).strip()
                tool_result = self.use_tool(tool_name, tool_input)
                answer += f"\n\n[Tool used: {tool_name}]\nResult: {tool_result}"
            return answer
        except Exception as e:
            print(f"Agent error: {e}")
            return f"[Agent Error: {e}]"

# run_and_submit_all receives the logged-in user's OAuth profile (None when not logged in)
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Run the agent on every fetched question and submit the answers.

    Args:
        profile: OAuth profile of the logged-in user, or ``None`` when nobody
            is logged in.

    Returns:
        A ``(status, table)`` pair. ``status`` is a human-readable message;
        ``table`` is a DataFrame of task id / question / submitted answer
        rows, or ``None`` when the run stopped before any question was
        processed.
    """
    if profile is None:
        return "Please login with your Hugging Face account.", None

    username = profile.username
    # Placeholder keeps the code link well-formed when SPACE_ID is not set.
    space_id = os.getenv("SPACE_ID") or "your-username/your-space"
    api_url = DEFAULT_API_URL

    try:
        agent = ToolEnhancedAgent()
    except Exception as e:
        return f"Error initializing agent: {e}", None

    # Download the question list.
    try:
        fetched = requests.get(f"{api_url}/questions", timeout=15)
        fetched.raise_for_status()
        questions_data = fetched.json()
    except Exception as e:
        return f"Error fetching questions: {e}", None

    answers_payload, results_log = [], []
    for item in questions_data:
        task_id, question_text = item.get("task_id"), item.get("question")
        # Only entries with both a task id and a question are processed.
        if task_id and question_text is not None:
            row = {"Task ID": task_id, "Question": question_text}
            try:
                answer = agent(question_text)
            except Exception as e:
                # Failed tasks are logged for display but not submitted.
                row["Submitted Answer"] = f"Agent Error: {e}"
            else:
                answers_payload.append({"task_id": task_id, "submitted_answer": answer})
                row["Submitted Answer"] = answer
            results_log.append(row)

    if len(answers_payload) == 0:
        return "Agent did not produce answers to submit.", pd.DataFrame(results_log)

    submission_data = dict(
        username=username.strip(),
        agent_code=f"https://huggingface.co/spaces/{space_id}/tree/main",
        answers=answers_payload,
    )

    try:
        submit_response = requests.post(f"{api_url}/submit", json=submission_data, timeout=60)
        submit_response.raise_for_status()
        result = submit_response.json()

        status = (
            f"Submission Successful!\n"
            f"User: {result.get('username')}\n"
            f"Score: {result.get('score', 'N/A')}% "
            f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)\n"
            f"Message: {result.get('message', 'No message')}"
        )
        return status, pd.DataFrame(results_log)
    except Exception as e:
        return f"Submission failed: {e}", pd.DataFrame(results_log)

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# GAIA Benchmark Agent Runner")
    gr.Markdown("""
    1. Login with your Hugging Face account.
    2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, and submit answers.
    """)

    # The login button only drives the OAuth sign-in flow; it is not wired as
    # an event input below.
    login_btn = gr.LoginButton()
    run_btn = gr.Button("Run Evaluation & Submit All Answers")

    status_out = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_df = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # No `inputs`: Gradio supplies the OAuth profile itself based on the
    # handler's `gr.OAuthProfile | None` annotation. Listing the login button
    # as an input would hand the one-parameter handler an extra positional
    # argument.
    run_btn.click(
        fn=run_and_submit_all,
        outputs=[status_out, results_df],
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True, share=False)