import requests
import html2text
from readability import Document
from langchain.agents import Tool
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
from langchain.text_splitter import RecursiveCharacterTextSplitter


def _error_response(status, message):
    """Build the error payload shape returned by `fetch_page`."""
    return {"status": status, "page_content": {"error_message": message}}


def _split_page_param(url):
    """Separate the synthetic `page` query parameter from the URL to fetch.

    Returns
    -------
    tuple
        `(clean_url, page_index)`. `clean_url` is `url` without the `page`
        parameter and without the fragment; every other query parameter is
        kept. `page_index` is the zero-based chunk index, or `None` when the
        `page` value is not an integer >= 1.
    """
    parsed_url = urlparse(url)
    # keep_blank_values so that parameters such as `?flag=` survive the rebuild.
    query_params = parse_qs(parsed_url.query, keep_blank_values=True)
    # A missing or blank `page` means the first page.
    raw_page = query_params.pop("page", ["1"])[0].strip() or "1"

    try:
        page_index = int(raw_page) - 1
    except ValueError:
        page_index = None
    else:
        # A negative index would silently wrap around to the end of the
        # chunk list, so reject it here.
        if page_index < 0:
            page_index = None

    clean_url = urlunparse((
        parsed_url.scheme,
        parsed_url.netloc,
        parsed_url.path,
        parsed_url.params,
        urlencode(query_params, doseq=True),
        "",
    ))
    return clean_url, page_index


def fetch_page(url, model_name='gpt-3.5-turbo', timeout_sec=10, chunk_size=1000):
    """Tool to fetch the content of a web page from a given URL.

    - This returns `title`, `content`, and `has_next` indicator. `content` is
      returned in markdown format.
    - Only one chunk of at most `chunk_size` tokens (1,000 by default) is
      returned per call.
    - If there is more content available on the page, the `has_next` value
      will be True.
    - To read the continuation, increment the `page` query parameter on the
      same URL and call again. Paging starts at 1.

    Parameters
    ----------
    url : str
        Address of the page. An optional `page` query parameter selects the
        chunk to return; it is removed before the request is sent, while all
        other query parameters are forwarded unchanged.
    model_name : str
        Model name handed to the tiktoken-based text splitter.
    timeout_sec : int or float
        Timeout passed to `requests.get`.
    chunk_size : int
        Maximum number of tokens per returned chunk.

    Returns
    -------
    Dict[str, Any]:
        - status: int (200 on success, otherwise an HTTP status code or 500)
        - page_content
            - title: str
            - content: str
            - has_next: bool
          On failure `page_content` holds a single `error_message` string
          instead.
    """
    url, page = _split_page_param(url)
    if page is None:
        # Reject a bad page number before spending a network round trip.
        return _error_response(
            500,
            'page parameter looks invalid. Please try to fetch other pages.'
        )

    try:
        response = requests.get(url, timeout=timeout_sec)
    except requests.exceptions.Timeout:
        return _error_response(
            500,
            'Could not download page due to Timeout Error. Please try to fetch other pages.'
        )
    except requests.exceptions.RequestException:
        # Connection failures, invalid URLs, redirect loops, etc. are reported
        # through the same payload rather than crashing the calling agent.
        return _error_response(
            500,
            'Could not download page. Please try to fetch other pages.'
        )

    response.encoding = 'utf-8'

    if response.status_code != 200:
        return _error_response(
            response.status_code,
            'Could not download page. Please try to fetch other pages.'
        )

    try:
        doc = Document(response.text)
        title = doc.title()
        html_content = doc.summary()
        content = html2text.html2text(html_content)
    except Exception:
        # Parser failures are reported to the caller; KeyboardInterrupt and
        # SystemExit are deliberately not intercepted.
        return _error_response(
            500,
            'Could not parse page. Please try to fetch other pages.'
        )

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=0,
    )
    chunks = text_splitter.split_text(content)

    # Also covers an empty page: with no chunks, even page 1 is out of range.
    if page >= len(chunks):
        return _error_response(
            500,
            'page parameter looks invalid. Please try to fetch other pages.'
        )

    return {
        "status": 200,
        "page_content": {
            "title": title,
            "content": chunks[page],
            "has_next": page < len(chunks) - 1
        }
    }


def get_fetch_page_tool():
    """Wrap `fetch_page` in a LangChain `Tool` named `fetch_page`.

    The description string is the text passed to `Tool` as its
    `description`; it explains the return shape and the `page` paging
    convention.
    """
    fetch_page_tool_description = """
    Tool to fetch the content of a web page from a given URL.
    This returns `status` and `page_content` (`title`, `content` and `has_next` indicator).
    If status is not 200, there was some error of fetching page. (Try fetch other pages.)
    If a status code other than 200 is returned, please don't give up and make sure to check other pages.

    By default, only up to 1,000 tokens of content are retrieved.
    If there is more content available on the page, the `has_next` value will be True.
    To read the continuation, you can increment the `page` parameter with the same URL and input them again.
    (paging is start with 1, so next page is 2)
    e.g. https://www.obamalibrary.gov/obamas/president-barack-obama?page=2
    """
    return Tool(
        name='fetch_page',
        func=fetch_page,
        description=fetch_page_tool_description
    )