File size: 6,805 Bytes
129cd69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""Util that calls Metaphor Search API.

In order to set this up, follow instructions at:
"""
import json
from typing import Dict, List, Optional

import aiohttp
import requests
from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator

from langchain.utils import get_from_dict_or_env

METAPHOR_API_URL = "https://api.metaphor.systems"


class MetaphorSearchAPIWrapper(BaseModel):
    """Wrapper for Metaphor Search API."""

    metaphor_api_key: str
    k: int = 10

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def _metaphor_search_results(
        self,
        query: str,
        num_results: int,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        start_crawl_date: Optional[str] = None,
        end_crawl_date: Optional[str] = None,
        start_published_date: Optional[str] = None,
        end_published_date: Optional[str] = None,
        use_autoprompt: Optional[bool] = None,
    ) -> List[dict]:
        headers = {"X-Api-Key": self.metaphor_api_key}
        params = {
            "numResults": num_results,
            "query": query,
            "includeDomains": include_domains,
            "excludeDomains": exclude_domains,
            "startCrawlDate": start_crawl_date,
            "endCrawlDate": end_crawl_date,
            "startPublishedDate": start_published_date,
            "endPublishedDate": end_published_date,
            "useAutoprompt": use_autoprompt,
        }
        response = requests.post(
            # type: ignore
            f"{METAPHOR_API_URL}/search",
            headers=headers,
            json=params,
        )

        response.raise_for_status()
        search_results = response.json()
        return search_results["results"]

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and endpoint exists in environment."""
        metaphor_api_key = get_from_dict_or_env(
            values, "metaphor_api_key", "METAPHOR_API_KEY"
        )
        values["metaphor_api_key"] = metaphor_api_key

        return values

    def results(
        self,
        query: str,
        num_results: int,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        start_crawl_date: Optional[str] = None,
        end_crawl_date: Optional[str] = None,
        start_published_date: Optional[str] = None,
        end_published_date: Optional[str] = None,
        use_autoprompt: Optional[bool] = None,
    ) -> List[Dict]:
        """Run query through Metaphor Search and return metadata.

        Args:
            query: The query to search for.
            num_results: The number of results to return.
            include_domains: A list of domains to include in the search. Only one of include_domains and exclude_domains should be defined.
            exclude_domains: A list of domains to exclude from the search. Only one of include_domains and exclude_domains should be defined.
            start_crawl_date: If specified, only pages we crawled after start_crawl_date will be returned.
            end_crawl_date: If specified, only pages we crawled before end_crawl_date will be returned.
            start_published_date: If specified, only pages published after start_published_date will be returned.
            end_published_date: If specified, only pages published before end_published_date will be returned.
            use_autoprompt: If true, we turn your query into a more Metaphor-friendly query. Adds latency.

        Returns:
            A list of dictionaries with the following keys:
                title - The title of the page
                url - The url
                author - Author of the content, if applicable. Otherwise, None.
                published_date - Estimated date published
                    in YYYY-MM-DD format. Otherwise, None.
        """  # noqa: E501
        raw_search_results = self._metaphor_search_results(
            query,
            num_results=num_results,
            include_domains=include_domains,
            exclude_domains=exclude_domains,
            start_crawl_date=start_crawl_date,
            end_crawl_date=end_crawl_date,
            start_published_date=start_published_date,
            end_published_date=end_published_date,
            use_autoprompt=use_autoprompt,
        )
        return self._clean_results(raw_search_results)

    async def results_async(
        self,
        query: str,
        num_results: int,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        start_crawl_date: Optional[str] = None,
        end_crawl_date: Optional[str] = None,
        start_published_date: Optional[str] = None,
        end_published_date: Optional[str] = None,
        use_autoprompt: Optional[bool] = None,
    ) -> List[Dict]:
        """Get results from the Metaphor Search API asynchronously."""

        # Function to perform the API call
        async def fetch() -> str:
            headers = {"X-Api-Key": self.metaphor_api_key}
            params = {
                "numResults": num_results,
                "query": query,
                "includeDomains": include_domains,
                "excludeDomains": exclude_domains,
                "startCrawlDate": start_crawl_date,
                "endCrawlDate": end_crawl_date,
                "startPublishedDate": start_published_date,
                "endPublishedDate": end_published_date,
                "useAutoprompt": use_autoprompt,
            }
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    f"{METAPHOR_API_URL}/search", json=params, headers=headers
                ) as res:
                    if res.status == 200:
                        data = await res.text()
                        return data
                    else:
                        raise Exception(f"Error {res.status}: {res.reason}")

        results_json_str = await fetch()
        results_json = json.loads(results_json_str)
        return self._clean_results(results_json["results"])

    def _clean_results(self, raw_search_results: List[Dict]) -> List[Dict]:
        cleaned_results = []
        for result in raw_search_results:
            cleaned_results.append(
                {
                    "title": result.get("title", "Unknown Title"),
                    "url": result.get("url", "Unknown URL"),
                    "author": result.get("author", "Unknown Author"),
                    "published_date": result.get("publishedDate", "Unknown Date"),
                }
            )
        return cleaned_results