rosacastillo committed
Commit: 03219e6
Parent(s): eac9f54

updating scripts for data collection with new tools pipeline

Files changed:
- scripts/get_mech_info.py +134 -9
- scripts/mech_request_utils.py +560 -0
- scripts/profitability.py +13 -4
- scripts/pull_data.py +14 -10
- scripts/tools.py +124 -4
scripts/get_mech_info.py
CHANGED

@@ -1,14 +1,22 @@
-from dataclasses import dataclass
 from string import Template
 from typing import Any
 from datetime import datetime, timedelta, UTC
-from utils import SUBGRAPH_API_KEY
+from utils import SUBGRAPH_API_KEY, measure_execution_time, DATA_DIR
 import requests
+from mech_request_utils import (
+    collect_all_mech_delivers,
+    collect_all_mech_requests,
+    clean_mech_delivers,
+    fix_duplicate_requestIds,
+    merge_requests_delivers,
+    get_ipfs_data,
+    only_delivers_loop,
+)
 
 OLD_MECH_SUBGRAPH_URL = (
     "https://api.thegraph.com/subgraphs/name/stakewise/ethereum-gnosis"
 )
-
+
 NETWORK_SUBGRAPH_URL = Template(
     """https://gateway-arbitrum.network.thegraph.com/api/${subgraph_api_key}/subgraphs/id/FxV6YUix58SpYmLBwc9gEHkwjfkqwe1X5FJQjn8nKPyA"""
 )
@@ -17,8 +25,11 @@ SUBGRAPH_HEADERS = {
     "Accept": "application/json, multipart/mixed",
     "Content-Type": "application/json",
 }
+
 QUERY_BATCH_SIZE = 1000
 DATETIME_60_DAYS_AGO = datetime.now(UTC) - timedelta(days=60)
+DATETIME_10_DAYS_AGO = datetime.now(UTC) - timedelta(days=10)
+DATETIME_10_HOURS_AGO = datetime.now(UTC) - timedelta(hours=10)
 BLOCK_NUMBER = Template(
     """
 {
@@ -38,6 +49,40 @@ BLOCK_NUMBER = Template(
     """
 )
 
+LATEST_BLOCK_QUERY = """
+{
+    blocks(
+        first: 1,
+        orderBy: timestamp,
+        orderDirection: desc,
+    ){
+        id,
+        number,
+    }
+}
+"""
+
+
+def fetch_last_block_number() -> dict:
+    # print(f"Sending query for the subgraph = {query}")
+    network_subgraph_url = NETWORK_SUBGRAPH_URL.substitute(
+        subgraph_api_key=SUBGRAPH_API_KEY
+    )
+    query = LATEST_BLOCK_QUERY
+    response = requests.post(
+        network_subgraph_url,
+        headers=SUBGRAPH_HEADERS,
+        json={"query": query},
+        timeout=300,
+    )
+
+    result_json = response.json()
+    print(f"Response of the query={result_json}")
+    blocks = result_json.get("data", {}).get("blocks", "")
+    if len(blocks) == 0:
+        raise ValueError(f"The query {query} did not return any results")
+    return blocks[0]
+
 
 def fetch_block_number(timestamp_from: int, timestamp_to: int) -> dict:
     """Get a block number by its timestamp margins."""
@@ -55,9 +100,9 @@ def fetch_block_number(timestamp_from: int, timestamp_to: int) -> dict:
         json={"query": query},
         timeout=300,
     )
-
+    # print(f"block query: {query}")
     result_json = response.json()
-    print(f"Response of the query={result_json}")
+    # print(f"Response of the query={result_json}")
     blocks = result_json.get("data", {}).get("blocks", "")
     if len(blocks) == 0:
         raise ValueError(f"The query {query} did not return any results")
@@ -100,9 +145,19 @@ def get_mech_info_2024() -> dict[str, Any]:
     return MECH_TO_INFO
 
 
-def get_mech_info_last_60_days() -> dict[str, Any]:
-    """Query the subgraph to get the last 60 days of information from mech."""
+def get_last_block_number() -> int:
+    last_block_number = fetch_last_block_number()
+    # expecting only one block
+    last_block_number = last_block_number.get("number", "")
+    if last_block_number.isdigit():
+        last_block_number = int(last_block_number)
 
+    if last_block_number == "":
+        raise ValueError("Could not find a valid block number for last month data")
+    return last_block_number
+
+
+def get_last_60_days_block_number() -> int:
     timestamp_60_days_ago = int((DATETIME_60_DAYS_AGO).timestamp())
     margin = timedelta(seconds=5)
     timestamp_60_days_ago_plus_margin = int((DATETIME_60_DAYS_AGO + margin).timestamp())
@@ -117,6 +172,12 @@ def get_mech_info_last_60_days() -> dict[str, Any]:
 
     if last_month_block_number == "":
         raise ValueError("Could not find a valid block number for last month data")
+    return last_month_block_number
+
+
+def get_mech_info_last_60_days() -> dict[str, Any]:
+    """Query the subgraph to get the last 60 days of information from mech."""
+    last_month_block_number = get_last_60_days_block_number()
 
     MECH_TO_INFO = {
         # this block number is when the creator had its first tx ever, and after this mech's creation
@@ -130,9 +191,73 @@ def get_mech_info_last_60_days() -> dict[str, Any]:
             last_month_block_number,
         ),
     }
+    print(f"last 60 days block number {last_month_block_number}")
     return MECH_TO_INFO
 
 
+def get_mech_info_last_10_days() -> dict[str, Any]:
+    """Query the subgraph to get the last 10 days of information from mech."""
+
+    timestamp_10_days_ago = int((DATETIME_10_DAYS_AGO).timestamp())
+    margin = timedelta(seconds=5)
+    timestamp_10_days_ago_plus_margin = int((DATETIME_10_DAYS_AGO + margin).timestamp())
+
+    last_month_block_number = fetch_block_number(
+        timestamp_10_days_ago, timestamp_10_days_ago_plus_margin
+    )
+    # expecting only one block
+    last_month_block_number = last_month_block_number.get("number", "")
+    if last_month_block_number.isdigit():
+        last_month_block_number = int(last_month_block_number)
+
+    if last_month_block_number == "":
+        raise ValueError("Could not find a valid block number for last month data")
+
+    MECH_TO_INFO = {
+        # this block number is when the creator had its first tx ever, and after this mech's creation
+        "0xff82123dfb52ab75c417195c5fdb87630145ae81": (
+            "old_mech_abi.json",
+            last_month_block_number,
+        ),
+        # this block number is when this mech was created
+        "0x77af31de935740567cf4ff1986d04b2c964a786a": (
+            "new_mech_abi.json",
+            last_month_block_number,
+        ),
+    }
+    print(f"last 10 days block number {last_month_block_number}")
+    return MECH_TO_INFO
+
+
+@measure_execution_time
+def get_mech_events_last_60_days():
+    earliest_block_number = get_last_60_days_block_number()
+    last_block_number = get_last_block_number()
+    # mech requests
+    requests_dict, duplicatedReqId = collect_all_mech_requests(
+        from_block=earliest_block_number, to_block=last_block_number
+    )
+
+    # mech delivers
+    delivers_dict, duplicatedIds = collect_all_mech_delivers(
+        from_block=earliest_block_number, to_block=last_block_number
+    )
+
+    # clean delivers
+    clean_mech_delivers()
+
+    # solve duplicated requestIds
+    block_map = fix_duplicate_requestIds()
+
+    # merge the two files into one source
+    not_found = merge_requests_delivers()
+
+    # Add ipfs contents
+    get_ipfs_data()
+
+
 if __name__ == "__main__":
-    result = get_mech_info_last_60_days()
-    print(result)
+    get_mech_events_last_60_days()
+
+    # result = get_mech_info_last_60_days()
+    # print(result)
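Note: the new get_mech_events_last_60_days entry point chains the helpers added in mech_request_utils.py in a fixed order. A minimal standalone sketch of that order follows (the calls are exactly the ones in the diff above; it assumes utils.SUBGRAPH_API_KEY is configured and that the json_data/ and data/ directories exist, which this commit does not show being created):

    # Sketch: the stage order wired up by get_mech_events_last_60_days
    from get_mech_info import get_last_60_days_block_number, get_last_block_number
    from mech_request_utils import (
        collect_all_mech_requests,
        collect_all_mech_delivers,
        clean_mech_delivers,
        fix_duplicate_requestIds,
        merge_requests_delivers,
        get_ipfs_data,
    )

    from_block = get_last_60_days_block_number()  # block number ~60 days ago
    to_block = get_last_block_number()            # latest block known to the subgraph

    collect_all_mech_requests(from_block=from_block, to_block=to_block)  # -> json_data/mech_requests.json
    collect_all_mech_delivers(from_block=from_block, to_block=to_block)  # -> json_data/mech_delivers.json
    clean_mech_delivers()       # drop delivers whose requestId has no matching request
    fix_duplicate_requestIds()  # pick the deliver with the closest block per duplicated id
    merge_requests_delivers()   # -> json_data/merged_requests.json
    get_ipfs_data()             # enrich with IPFS contents -> data/tools_info.json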
scripts/mech_request_utils.py
ADDED

@@ -0,0 +1,560 @@
+# -*- coding: utf-8 -*-
+# ------------------------------------------------------------------------------
+#
+#   Copyright 2024 Valory AG
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+#
+# ------------------------------------------------------------------------------
+
+"""Script for retrieving mech requests and their delivers."""
+import os
+import json
+import time
+import pickle
+from collections import defaultdict
+from typing import Any, Dict, List, Tuple
+from pathlib import Path
+import requests
+from gql import Client, gql
+from gql.transport.requests import RequestsHTTPTransport
+from tools import (
+    IPFS_POLL_INTERVAL,
+    GET_CONTENTS_BATCH_SIZE,
+    IRRELEVANT_TOOLS,
+    create_session,
+    request,
+)
+from tqdm import tqdm
+from markets import PEARL_CREATOR, CREATOR
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+NUM_WORKERS = 10
+BLOCKS_CHUNK_SIZE = 10000
+TEXT_ALIGNMENT = 30
+MINIMUM_WRITE_FILE_DELAY_SECONDS = 20
+MECH_FROM_BLOCK_RANGE = 50000
+SCRIPTS_DIR = Path(__file__).parent
+ROOT_DIR = SCRIPTS_DIR.parent
+JSON_DATA_DIR = ROOT_DIR / "json_data"
+DATA_DIR = ROOT_DIR / "data"
+IPFS_ADDRESS = "https://gateway.autonolas.tech/ipfs/"
+THEGRAPH_ENDPOINT = "https://api.studio.thegraph.com/query/57238/mech/0.0.2"
+last_write_time = 0.0
+
+REQUESTS_QUERY_FILTER = """
+query requests_query($sender_not_in: [Bytes!], $id_gt: Bytes, $blockNumber_gte: BigInt, $blockNumber_lte: BigInt) {
+  requests(where: {sender_not_in: $sender_not_in, id_gt: $id_gt, blockNumber_gte: $blockNumber_gte, blockNumber_lte: $blockNumber_lte}, orderBy: id, first: 1000) {
+    blockNumber
+    blockTimestamp
+    id
+    ipfsHash
+    requestId
+    sender
+    transactionHash
+  }
+}
+"""
+
+DELIVERS_QUERY_NO_FILTER = """
+query delivers_query($id_gt: Bytes, $blockNumber_gte: BigInt, $blockNumber_lte: BigInt) {
+  delivers(where: {id_gt: $id_gt, blockNumber_gte: $blockNumber_gte, blockNumber_lte: $blockNumber_lte}, orderBy: id, first: 1000) {
+    blockNumber
+    blockTimestamp
+    id
+    ipfsHash
+    requestId
+    sender
+    transactionHash
+  }
+}
+
+"""
+DELIVERS_QUERY = """
+query delivers_query($requestId: BigInt, $blockNumber_gte: BigInt, $blockNumber_lte: BigInt) {
+  delivers(where: {requestId: $requestId, blockNumber_gte: $blockNumber_gte, blockNumber_lte: $blockNumber_lte}, orderBy: blockNumber, first: 1000) {
+    blockNumber
+    blockTimestamp
+    id
+    ipfsHash
+    requestId
+    sender
+    transactionHash
+  }
+}
+"""
+
+MISSING_DELIVERS_QUERY = """
+query delivers_query($requestId: BigInt, $blockNumber_gte: BigInt, $blockNumber_lte: BigInt) {
+  delivers(where: {requestId: $requestId, blockNumber_gte: $blockNumber_gte, blockNumber_lte: $blockNumber_lte}, orderBy: blockNumber, first: 1000) {
+    blockNumber
+    blockTimestamp
+    id
+    ipfsHash
+    requestId
+    sender
+    transactionHash
+  }
+}
+"""
+
+
+def collect_all_mech_requests(from_block: int, to_block: int) -> Tuple:
+
+    print(f"Fetching all mech requests from {from_block} to {to_block}")
+    mech_requests = {}
+    duplicated_reqIds = []
+    transport = RequestsHTTPTransport(url=THEGRAPH_ENDPOINT)
+    client = Client(transport=transport, fetch_schema_from_transport=True)
+
+    id_gt = "0x00"
+    while True:
+        variables = {
+            "sender_not_in": [CREATOR, PEARL_CREATOR],
+            "id_gt": id_gt,
+            "blockNumber_gte": str(from_block),  # str
+            "blockNumber_lte": str(to_block),  # str
+        }
+        try:
+            response = client.execute(
+                gql(REQUESTS_QUERY_FILTER), variable_values=variables
+            )
+
+            items = response.get("requests", [])
+
+            if not items:
+                break
+
+            for mech_request in items:
+                if mech_request["id"] not in mech_requests:
+                    mech_requests[mech_request["id"]] = mech_request
+                else:
+                    duplicated_reqIds.append(mech_request["id"])
+        except Exception as e:
+            print(f"Error while getting the response: {e}")
+
+        id_gt = items[-1]["id"]
+        time.sleep(IPFS_POLL_INTERVAL)
+        print(f"New execution for id_gt = {id_gt}")
+        if len(duplicated_reqIds) > 0:
+            print(f"Number of duplicated req Ids = {len(duplicated_reqIds)}")
+        save_json_file(mech_requests, "mech_requests.json")
+
+    print(f"Number of requests = {len(mech_requests)}")
+    print(f"Number of duplicated req Ids = {len(duplicated_reqIds)}")
+    save_json_file(mech_requests, "mech_requests.json")
+    return mech_requests, duplicated_reqIds
+
+
+def collect_all_mech_delivers(from_block: int, to_block: int) -> Tuple:
+
+    print(f"Fetching all mech delivers from {from_block} to {to_block}")
+    mech_delivers = {}
+    duplicated_requestIds = []
+    transport = RequestsHTTPTransport(url=THEGRAPH_ENDPOINT)
+    client = Client(transport=transport, fetch_schema_from_transport=True)
+    to_block = (
+        to_block + MECH_FROM_BLOCK_RANGE
+    )  # there is a delay between deliver and request
+    id_gt = ""
+    while True:
+        variables = {
+            "id_gt": id_gt,
+            "blockNumber_gte": str(from_block),  # str
+            "blockNumber_lte": str(to_block),  # str
+        }
+        try:
+            response = client.execute(
+                gql(DELIVERS_QUERY_NO_FILTER), variable_values=variables
+            )
+            items = response.get("delivers", [])
+
+            if not items:
+                break
+
+            for mech_deliver in items:
+                if mech_deliver["requestId"] not in mech_delivers:
+                    mech_delivers[mech_deliver["requestId"]] = [mech_deliver]
+                else:
+                    duplicated_requestIds.append(mech_deliver["requestId"])
+                    # we will handle the duplicated later
+                    mech_delivers[mech_deliver["requestId"]].append(mech_deliver)
+        except Exception as e:
+            print(f"Error while getting the response: {e}")
+            return
+
+        id_gt = items[-1]["id"]
+        time.sleep(IPFS_POLL_INTERVAL)
+        print(f"New execution for id_gt = {id_gt}")
+        if len(duplicated_requestIds) > 0:
+            print(f"Number of duplicated request id = {len(duplicated_requestIds)}")
+        save_json_file(mech_delivers, "mech_delivers.json")
+    print(f"Number of delivers = {len(mech_delivers)}")
+    print(f"Number of duplicated request id = {len(duplicated_requestIds)}")
+    save_json_file(mech_delivers, "mech_delivers.json")
+    return mech_delivers, duplicated_requestIds
+
+
+def collect_missing_delivers(request_id: int, block_number: int) -> Dict[str, Any]:
+    to_block = (
+        block_number + MECH_FROM_BLOCK_RANGE
+    )  # there is a delay between deliver and request
+    print(f"Fetching all missing delivers from {block_number} to {to_block}")
+    mech_delivers = {}
+    transport = RequestsHTTPTransport(url=THEGRAPH_ENDPOINT)
+    client = Client(transport=transport, fetch_schema_from_transport=True)
+
+    variables = {
+        "requestId": request_id,
+        "blockNumber_gte": str(block_number),  # str
+        "blockNumber_lte": str(to_block),  # str
+    }
+    try:
+        response = client.execute(
+            gql(MISSING_DELIVERS_QUERY), variable_values=variables
+        )
+        items = response.get("delivers", [])
+        # If the user sends requests with the same values (tool, prompt, nonce) it
+        # will generate the same requestId. Therefore, multiple items can be retrieved
+        # at this point. We assume the most likely deliver to this request is the
+        # one with the closest blockNumber among all delivers with the same requestId.
+        if items:
+            return items[0]
+    except Exception as e:
+        print(f"Error while getting the response: {e}")
+
+    return mech_delivers
+
+
+def populate_requests_ipfs_contents(
+    session: requests.Session, mech_requests: Dict[str, Any], keys_to_traverse: list
+) -> dict:
+    updated_dict = {}
+    wrong_response_count = 0
+    for k in tqdm(
+        keys_to_traverse,
+        desc="Fetching IPFS contents for requests",
+        position=1,
+        unit="results",
+    ):
+        mech_request = mech_requests[k]
+
+        if "ipfsContents" not in mech_request:
+            ipfs_hash = mech_request["ipfsHash"]
+            url = f"{IPFS_ADDRESS}{ipfs_hash}/metadata.json"
+            response = request(session, url)
+            if response is None:
+                tqdm.write(f"Skipping {mech_request=}. because response was None")
+                wrong_response_count += 1
+                continue
+            try:
+                contents = response.json()
+                if contents["tool"] in IRRELEVANT_TOOLS:
+                    continue
+                mech_request["ipfsContents"] = contents
+            except requests.exceptions.JSONDecodeError:
+                tqdm.write(
+                    f"Skipping {mech_request} because of JSONDecodeError when parsing response"
+                )
+                wrong_response_count += 1
+                continue
+        updated_dict[k] = mech_request
+        time.sleep(IPFS_POLL_INTERVAL)
+
+    return updated_dict
+
+
+def populate_delivers_ipfs_contents(
+    session: requests.Session, mech_requests: Dict[str, Any], keys_to_traverse: list
+) -> dict:
+    """Function to complete the delivers content info from ipfs"""
+    updated_dict = {}
+    for k in tqdm(
+        keys_to_traverse,
+        desc="Fetching IPFS contents for delivers",
+        position=1,
+        unit="results",
+    ):
+        mech_request = mech_requests[k]
+        if "deliver" not in mech_request or len(mech_request["deliver"]) == 0:
+            print(f"Skipping mech request {mech_request} because of no delivers info")
+            continue
+
+        deliver = mech_request["deliver"]
+        if "ipfsContents" not in deliver:
+            ipfs_hash = deliver["ipfsHash"]
+            request_id = deliver["requestId"]
+            url = f"{IPFS_ADDRESS}{ipfs_hash}/{request_id}"
+            response = request(session, url)
+            if response is None:
+                tqdm.write(f"Skipping {mech_request=}.")
+                continue
+            try:
+                contents = response.json()
+                metadata = contents.get("metadata", None)
+                if metadata and contents["metadata"]["tool"] in IRRELEVANT_TOOLS:
+                    continue
+                contents.pop("cost_dict", None)
+                deliver["ipfsContents"] = contents
+            except requests.exceptions.JSONDecodeError:
+                tqdm.write(f"Skipping {mech_request} because of JSONDecodeError")
+                continue
+            except Exception:
+                tqdm.write(
+                    f"Skipping {mech_request} because of error parsing the response"
+                )
+                continue
+        updated_dict[k] = mech_request
+        time.sleep(IPFS_POLL_INTERVAL)
+
+    return updated_dict
+
+
+def write_mech_events_to_file(
+    mech_requests: Dict[str, Any],
+    filename: str,
+    force_write: bool = False,
+) -> None:
+    global last_write_time  # pylint: disable=global-statement
+    now = time.time()
+
+    if len(mech_requests) == 0:
+        return
+
+    filename_path = DATA_DIR / filename
+    if force_write or (now - last_write_time) >= MINIMUM_WRITE_FILE_DELAY_SECONDS:
+        with open(filename_path, "w", encoding="utf-8") as file:
+            json.dump(mech_requests, file, indent=2)
+        last_write_time = now
+
+
+def save_final_tools_json_file(data: Dict[str, Any], filename: str):
+    filename_path = DATA_DIR / filename
+    with open(filename_path, "w", encoding="utf-8") as file:
+        json.dump(data, file, indent=2)
+
+
+def save_json_file(data: Dict[str, Any], filename: str):
+    """Function to save the content into a json file"""
+    filename_path = JSON_DATA_DIR / filename
+    with open(filename_path, "w", encoding="utf-8") as file:
+        json.dump(data, file, indent=2)
+
+
+def clean_mech_delivers() -> None:
+    """Function to remove from the delivers json file the request Ids that are not in the mech requests"""
+    # read mech requests
+    with open(JSON_DATA_DIR / "mech_requests.json", "r") as file:
+        mech_requests = json.load(file)
+
+    list_reqIds = [mech_requests[k].get("requestId") for k in mech_requests.keys()]
+    # remove duplicated elements
+    list_reqIds = list(set(list_reqIds))
+
+    # remove requestIds from delivers that are not in this list
+    with open(JSON_DATA_DIR / "mech_delivers.json", "r") as file:
+        mech_delivers = json.load(file)
+
+    print(f"original size of the file {len(mech_delivers)}")
+    to_delete = []
+    for r in mech_delivers.keys():
+        if r not in list_reqIds:
+            to_delete.append(r)
+
+    for r in to_delete:
+        mech_delivers.pop(r, None)
+    print(f"final size of the file {len(mech_delivers)}")
+    save_json_file(mech_delivers, "mech_delivers.json")
+
+
+def get_request_block_numbers(
+    mech_requests: Dict[str, Any], target_req_id: int
+) -> list:
+    block_numbers = []
+
+    for entry in mech_requests.values():
+        if entry["requestId"] == target_req_id:
+            block_numbers.append(entry["blockNumber"])
+
+    return block_numbers
+
+
+def update_block_request_map(block_request_id_map: dict) -> None:
+    print("Saving block request id map info")
+    with open(JSON_DATA_DIR / "block_request_id_map.pickle", "wb") as handle:
+        pickle.dump(block_request_id_map, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+def fix_duplicate_requestIds() -> dict:
+    with open(JSON_DATA_DIR / "mech_delivers.json", "r") as file:
+        data_delivers = json.load(file)
+
+    with open(JSON_DATA_DIR / "mech_requests.json", "r") as file:
+        mech_requests = json.load(file)
+    list_request_Ids = list(data_delivers.keys())
+
+    list_duplicated_reqIds = []
+    for req_Id in list_request_Ids:
+        if len(data_delivers.get(req_Id)) > 1:
+            list_duplicated_reqIds.append(req_Id)
+
+    print(len(list_duplicated_reqIds))
+    block_request_id_map = {}
+
+    for req_Id in list_duplicated_reqIds:
+        # get the list of mech request block numbers for that requestId
+        block_nrs = get_request_block_numbers(mech_requests, req_Id)
+        # get the list of mech delivers
+        mech_delivers_list = data_delivers.get(req_Id)  # list of dictionaries
+        if len(block_nrs) > 1:
+            print("More than one block number was found")
+        for block_nr in block_nrs:
+            key = (block_nr, req_Id)
+            min_difference_request = min(
+                mech_delivers_list,
+                key=lambda x: abs(int(x["blockNumber"]) - int(block_nr)),
+            )
+            block_request_id_map[key] = min_difference_request
+
+    update_block_request_map(block_request_id_map)
+
+    return block_request_id_map
+
+
+def merge_requests_delivers() -> None:
+    """Function to map requests and delivers"""
+    with open(JSON_DATA_DIR / "mech_delivers.json", "r") as file:
+        mech_delivers = json.load(file)
+
+    with open(JSON_DATA_DIR / "mech_requests.json", "r") as file:
+        mech_requests = json.load(file)
+
+    # read the block map for duplicated requestIds
+    with open(JSON_DATA_DIR / "block_request_id_map.pickle", "rb") as handle:
+        # key = (block_nr, req_Id) value = delivers dictionary
+        block_request_id_map = pickle.load(handle)
+    for _, mech_req in tqdm(
+        mech_requests.items(),
+        desc=f"Merging delivers data into the mech requests",
+    ):
+        if "deliver" in mech_req:
+            continue
+
+        block_number_req = mech_req["blockNumber"]
+        req_Id = mech_req["requestId"]
+        # check if it is in the duplicated map
+        key = (block_number_req, req_Id)
+        if key in block_request_id_map.keys():
+            deliver_dict = block_request_id_map[key]
+        elif req_Id in mech_delivers.keys():
+            deliver_dict = mech_delivers.get(req_Id)[0]  # the value is a list
+        else:
+            print("No deliver entry found for this request Id")
+            deliver_dict = collect_missing_delivers(
+                request_id=req_Id, block_number=int(block_number_req)
+            )
+
+        # extract the info and append it to the original mech request dictionary
+        mech_req["deliver"] = deliver_dict
+    save_json_file(mech_requests, "merged_requests.json")
+    return
+
+
+def get_ipfs_data():
+    with open(JSON_DATA_DIR / "merged_requests.json", "r") as file:
+        mech_requests = json.load(file)
+
+    total_keys_to_traverse = list(mech_requests.keys())
+    updated_mech_requests = dict()
+    session = create_session()
+    print("UPDATING IPFS CONTENTS OF REQUESTS")
+    # requests
+    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+        futures = []
+        for i in range(0, len(mech_requests), GET_CONTENTS_BATCH_SIZE):
+            futures.append(
+                executor.submit(
+                    populate_requests_ipfs_contents,
+                    session,
+                    mech_requests,
+                    total_keys_to_traverse[i : i + GET_CONTENTS_BATCH_SIZE],
+                )
+            )
+
+        for future in tqdm(
+            as_completed(futures),
+            total=len(futures),
+            desc=f"Fetching all ipfs contents from requests ",
+        ):
+            partial_dict = future.result()
+            updated_mech_requests.update(partial_dict)
+
+    save_final_tools_json_file(updated_mech_requests, "tools_info.json")
+
+    # delivers
+    print("UPDATING IPFS CONTENTS OF DELIVERS")
+    total_keys_to_traverse = list(updated_mech_requests.keys())
+    final_tools_content = {}
+    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+        futures = []
+        for i in range(0, len(updated_mech_requests), GET_CONTENTS_BATCH_SIZE):
+            futures.append(
+                executor.submit(
+                    populate_delivers_ipfs_contents,
+                    session,
+                    updated_mech_requests,
+                    total_keys_to_traverse[i : i + GET_CONTENTS_BATCH_SIZE],
+                )
+            )
+
+        for future in tqdm(
+            as_completed(futures),
+            total=len(futures),
+            desc=f"Fetching all ipfs contents from delivers ",
+        ):
+            partial_dict = future.result()
+            final_tools_content.update(partial_dict)
+
+    save_final_tools_json_file(final_tools_content, "tools_info.json")
+
+
+def only_delivers_loop():
+    with open(DATA_DIR / "tools_info.json", "r") as file:
+        updated_mech_requests = json.load(file)
+
+    # delivers
+    session = create_session()
+    print("UPDATING IPFS CONTENTS OF DELIVERS")
+    total_keys_to_traverse = list(updated_mech_requests.keys())
+    final_tools_content = {}
+    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+        futures = []
+        for i in range(0, len(updated_mech_requests), GET_CONTENTS_BATCH_SIZE):
+            futures.append(
+                executor.submit(
+                    populate_delivers_ipfs_contents,
+                    session,
+                    updated_mech_requests,
+                    total_keys_to_traverse[i : i + GET_CONTENTS_BATCH_SIZE],
+                )
+            )
+
+        for future in tqdm(
+            as_completed(futures),
+            total=len(futures),
+            desc=f"Fetching all ipfs contents from delivers ",
+        ):
+            partial_dict = future.result()
+            final_tools_content.update(partial_dict)
+
+    save_final_tools_json_file(final_tools_content, "tools_info.json")
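Note: both collectors in this new file page through The Graph with a keyset cursor on id (the id_gt variable) rather than skip/offset pagination. A self-contained sketch of the pattern, assuming the same endpoint as above; the query is trimmed to two fields for brevity:

    # Sketch: keyset ("id_gt") pagination against the mech subgraph
    import time
    from gql import Client, gql
    from gql.transport.requests import RequestsHTTPTransport

    MECH_SUBGRAPH = "https://api.studio.thegraph.com/query/57238/mech/0.0.2"
    PAGE_QUERY = gql(
        """
        query ($id_gt: Bytes) {
          requests(where: {id_gt: $id_gt}, orderBy: id, first: 1000) {
            id
            requestId
          }
        }
        """
    )

    def fetch_all_requests() -> dict:
        client = Client(transport=RequestsHTTPTransport(url=MECH_SUBGRAPH))
        collected = {}
        id_gt = "0x00"  # cursor: only return ids strictly greater than this
        while True:
            page = client.execute(PAGE_QUERY, variable_values={"id_gt": id_gt})["requests"]
            if not page:
                return collected
            for item in page:
                collected[item["id"]] = item
            id_gt = page[-1]["id"]  # advance the cursor; results are ordered by id
            time.sleep(0.5)  # throttle, as the script does with IPFS_POLL_INTERVAL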
scripts/profitability.py
CHANGED

@@ -99,6 +99,7 @@ class MarketAttribute(Enum):
 
 ALL_TRADES_STATS_DF_COLS = [
     "trader_address",
+    "market_creator",
     "trade_id",
     "creation_timestamp",
     "title",
@@ -407,6 +408,7 @@ def analyse_trader(
     earnings, winner_trade = (0, False)
     redemption = _is_redeemed(user_json, trade)
     current_answer = trade["fpmm.currentAnswer"]
+    market_creator = trade["market_creator"]
 
     # Determine market status
     market_status = determine_market_status(trade, current_answer)
@@ -431,9 +433,16 @@
         winner_trade = True
 
     # Compute mech calls
-    num_mech_calls = (
-        tools_usage["prompt_request"].apply(lambda x: trade["title"] in x).sum()
-    )
+    try:
+        num_mech_calls = (
+            tools_usage["prompt_request"]
+            .apply(lambda x: trade["title"] in x)
+            .sum()
+        )
+    except Exception:
+        print(f"Error while getting the number of mech calls")
+        num_mech_calls = 0  # No info
+
     net_earnings = (
         earnings
         - fee_amount
@@ -444,6 +453,7 @@
     # Assign values to DataFrame
     trades_df.loc[i] = {
         "trader_address": trader_address,
+        "market_creator": market_creator,
         "trade_id": trade["id"],
         "market_status": market_status.name,
         "creation_timestamp": creation_timestamp_utc,
@@ -548,7 +558,6 @@ def run_profitability_analysis(
         rpc, tools_filename, trades_filename, from_timestamp
     )
    tools["trader_address"] = tools["trader_address"].str.lower()
-    print(f"List of market creators = {trades["trader_address"].unique()}")
 
     # all trades profitability df
     print("Analysing trades...")
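Note: the try/except block added above counts the mech calls for a trade by substring-matching the market title against every prompt in the tools data. A toy example of the same pandas expression (the column name matches tools.py; the data here is invented):

    import pandas as pd

    tools_usage = pd.DataFrame(
        {
            "prompt_request": [
                "Will ETH reach $4k in March? Please estimate p_yes.",
                "Will it rain in Berlin tomorrow?",
                "Will ETH reach $4k in March? (follow-up call)",
            ]
        }
    )
    trade = {"title": "Will ETH reach $4k in March?"}

    # count prompts that contain the market title as a substring
    num_mech_calls = tools_usage["prompt_request"].apply(lambda x: trade["title"] in x).sum()
    print(num_mech_calls)  # 2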
scripts/pull_data.py
CHANGED

@@ -16,10 +16,13 @@ from tools import (
     etl as tools_etl,
     DEFAULT_FILENAME as TOOLS_FILENAME,
     update_tools_accuracy,
+    generate_tools_file,
 )
 from profitability import run_profitability_analysis
 from utils import get_question, current_answer, RPC
-from get_mech_info import get_mech_info_last_60_days
+from get_mech_info import (
+    get_mech_events_last_60_days,
+)
 from update_tools_accuracy import compute_tools_accuracy
 import gc
 
@@ -122,21 +125,20 @@ def updating_timestamps(rpc: str):
 def weekly_analysis():
     """Run weekly analysis for the FPMMS project."""
     rpc = RPC
-
     # Run markets ETL
     logging.info("Running markets ETL")
     mkt_etl(MARKETS_FILENAME)
     logging.info("Markets ETL completed")
 
-    # Run tools ETL
-    logging.info("Running tools ETL")
+    # New tools ETL
+    logging.info("Generating the mech json files")
+    get_mech_events_last_60_days()
+    logging.info("Finished generating the mech json files")
 
-    # this etl is saving the tools parquet file
-    tools_etl(
-        rpcs=[rpc],
-        mech_info=get_mech_info_last_60_days(),
-        filename=TOOLS_FILENAME,
-    )
+    # Run tools ETL
+    logging.info("Running new tools ETL")
+    get_mech_events_last_60_days()
+    generate_tools_file()
     logging.info("Tools ETL completed")
 
     # Run profitability analysis
@@ -146,6 +148,7 @@ def weekly_analysis():
     run_profitability_analysis(
         rpc=rpc,
     )
+
     logging.info("Profitability analysis completed")
     add_current_answer()
     try:
@@ -163,3 +166,4 @@ if __name__ == "__main__":
     weekly_analysis()
     # rpc = RPC
     # updating_timestamps(rpc)
+    # compute_tools_accuracy()
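Note: with this change the weekly job derives the tools parquet file from the subgraph/IPFS json pipeline rather than from the RPC-based tools_etl. A sketch of the new two-step flow (the calls mirror the diff above; output paths are the defaults hard-coded in the scripts):

    from get_mech_info import get_mech_events_last_60_days
    from tools import generate_tools_file

    get_mech_events_last_60_days()  # subgraph + IPFS collection -> data/tools_info.json
    generate_tools_file()           # tools_info.json -> parquet tools file (DEFAULT_FILENAME)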
scripts/tools.py
CHANGED

@@ -18,7 +18,7 @@
 # ------------------------------------------------------------------------------
 
 import os.path
-import
+import json
 import time
 import random
 from typing import (
@@ -66,6 +66,10 @@ from utils import (
     HTTP,
     HTTPS,
     REQUEST_SENDER,
+    get_result_values,
+    get_vote,
+    get_win_probability,
+    get_prediction_values,
 )
 
 CONTRACTS_PATH = "contracts"
@@ -184,7 +188,7 @@ def get_events(
             f"An error was raised from the RPC: {exc}\n Retrying in {sleep} seconds."
         )
         if hasattr(exc, "message"):
-            tqdm.write(f"Error message: {exc.message}")
+            tqdm.write(f"Error message: {exc.message}\n")
         time.sleep(sleep)
 
     from_block += batch_size
@@ -201,6 +205,7 @@ def get_events(
 
 
 def parse_events(raw_events: List) -> List[MechEvent]:
+    # TODO use dictionary instead of List
     """Parse all the specified MechEvents."""
     parsed_events = []
     for event in raw_events:
@@ -215,6 +220,24 @@ def parse_events(raw_events: List) -> List[MechEvent]:
     return parsed_events
 
 
+def parse_dict_events(events_dict: dict) -> List[MechEvent]:
+    # TODO use dictionary instead of List
+    """Parse all the specified MechEvents."""
+    parsed_events = []
+    list_ids = list(events_dict.keys())
+    for mech_id in list_ids:
+        event = events_dict[mech_id]
+        for_block = event.get("blockNumber", 0)
+        args = event.get(EVENT_ARGUMENTS, {})
+        request_id = args.get(REQUEST_ID, 0)
+        data = args.get(DATA, b"")
+        sender = args.get(REQUEST_SENDER, "")
+        parsed_event = MechEvent(for_block, request_id, data, sender)
+        parsed_events.append(parsed_event)
+
+    return parsed_events
+
+
 def create_session() -> requests.Session:
     """Create a session with a retry strategy."""
     session = requests.Session()
@@ -258,7 +281,7 @@ def parse_ipfs_response(
         return response.json()
     except requests.exceptions.JSONDecodeError:
         # this is a workaround because the `metadata.json` file was introduced and removed multiple times
-        if event_name ==
+        if event_name == MechEvent.REQUEST and url != event.ipfs_request_link:
             url = event.ipfs_request_link
             response = request(session, url)
             if response is None:
@@ -320,6 +343,54 @@ def get_contents(
     return pd.DataFrame(contents)
 
 
+def parse_json_events(json_events: dict, keys_to_traverse: List[int]) -> pd.DataFrame:
+    """Function to parse the mech info in a json format"""
+    all_records = []
+    for key in keys_to_traverse:
+        try:
+            json_input = json_events[key]
+            output = {}
+            output["request_id"] = json_input["requestId"]
+            output["request_block"] = json_input["blockNumber"]
+            output["prompt_request"] = json_input["ipfsContents"]["prompt"]
+            output["tool"] = json_input["ipfsContents"]["tool"]
+            output["nonce"] = json_input["ipfsContents"]["nonce"]
+            output["trader_address"] = json_input["sender"]
+            output["deliver_block"] = json_input["deliver"]["blockNumber"]
+            error_value, error_message, prediction_params = get_result_values(
+                json_input["deliver"]["ipfsContents"]["result"]
+            )
+            error_message_value = json_input.get("error_message", error_message)
+            output["error"] = error_value
+            output["error_message"] = error_message_value
+            output["prompt_response"] = json_input["deliver"]["ipfsContents"]["prompt"]
+            output["mech_address"] = json_input["deliver"]["sender"]
+            p_yes_value, p_no_value, confidence_value, info_utility_value = (
+                get_prediction_values(prediction_params)
+            )
+            output["p_yes"] = p_yes_value
+            output["p_no"] = p_no_value
+            output["confidence"] = confidence_value
+            output["info_utility"] = info_utility_value
+            output["vote"] = get_vote(p_yes_value, p_no_value)
+            output["win_probability"] = get_win_probability(p_yes_value, p_no_value)
+            all_records.append(output)
+        except Exception as e:
+            print(e)
+            print(f"Error parsing the key ={key}. Noted as error")
+            output["error"] = 1
+            output["error_message"] = "Response parsing error"
+            output["p_yes"] = None
+            output["p_no"] = None
+            output["confidence"] = None
+            output["info_utility"] = None
+            output["vote"] = None
+            output["win_probability"] = None
+            all_records.append(output)
+
+    return pd.DataFrame.from_dict(all_records, orient="columns")
+
+
 def transform_request(contents: pd.DataFrame) -> pd.DataFrame:
     """Transform the requests dataframe."""
     return clean(contents)
@@ -370,7 +441,7 @@ def store_progress(
             content.to_parquet(DATA_DIR / event_filename, index=False)
         except Exception as e:
             print(f"Failed to write {event_name} data: {e}")
-    # Drop result
+    # Drop result columns for tools DataFrame
     try:
         if "result" in tools.columns:
             tools = tools.drop(columns=["result"])
@@ -487,6 +558,55 @@ def etl(
     return tools
 
 
+def parse_store_json_events_parallel(
+    json_events: Dict[str, Any], filename: str = DEFAULT_FILENAME
+):
+    total_nr_events = len(json_events)
+    ids_to_traverse = list(json_events.keys())
+    print(f"Parsing {total_nr_events} events")
+    contents = []
+    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+        futures = []
+        for i in range(0, total_nr_events, GET_CONTENTS_BATCH_SIZE):
+            futures.append(
+                executor.submit(
+                    parse_json_events,
+                    json_events,
+                    ids_to_traverse[i : i + GET_CONTENTS_BATCH_SIZE],
+                )
+            )
+
+        for future in tqdm(
+            as_completed(futures),
+            total=len(futures),
+            desc=f"Fetching json contents",
+        ):
+            current_mech_contents = future.result()
+            contents.append(current_mech_contents)
+
+    tools = pd.concat(contents, ignore_index=True)
+    print(f"Length of the contents dataframe {len(tools)}")
+    print(tools.info())
+    try:
+        if "result" in tools.columns:
+            tools = tools.drop(columns=["result"])
+        tools.to_parquet(DATA_DIR / filename, index=False)
+    except Exception as e:
+        print(f"Failed to write tools data: {e}")
+
+    return tools
+
+
+def generate_tools_file():
+    """Function to parse the json mech events and generate the parquet tools file"""
+    try:
+        with open(DATA_DIR / "tools_info.json", "r") as file:
+            file_contents = json.load(file)
+        parse_store_json_events_parallel(file_contents)
+    except Exception as e:
+        print(f"An Exception happened while parsing the json events {e}")
+
+
 def update_tools_accuracy(
     tools_acc: pd.DataFrame, tools_df: pd.DataFrame, inc_tools: List[str]
 ) -> pd.DataFrame: