# Tool: web_browser — fetches the given urls and extracts their page text
# using langchain's AsyncHtmlLoader / BeautifulSoupTransformer.
import httpx
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

from ..tools.tool import Tool
class WebBrowser(Tool):
    """Tool that fetches one or more urls and returns their extracted text.

    Pages are loaded with langchain's ``AsyncHtmlLoader``, reduced to text
    with ``BeautifulSoupTransformer`` (``<span>`` tags only), and optionally
    split into ~1000-token chunks when ``split_url_into_chunk`` is truthy in
    the config.
    """

    # NOTE(review): the original description was a garbled copy-paste from an
    # unrelated image-generation tool; replaced with one matching this tool.
    description = 'browse the web pages of the given urls and extract their text content'
    name = 'web_browser'
    parameters: list = [{
        'name': 'urls',
        'description': 'the urls that the user wants to browse',
        'required': True
    }]

    def __init__(self, cfg=None):
        # cfg=None avoids the shared-mutable-default pitfall of `cfg={}`;
        # callers that passed a dict (or nothing) behave exactly as before.
        super().__init__(cfg if cfg is not None else {})
        self.split_url_into_chunk = self.cfg.get('split_url_into_chunk', False)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)'
        }
        # NOTE(review): this client is never used by _local_call
        # (AsyncHtmlLoader does its own fetching); kept only for backward
        # compatibility. verify=False disables TLS certificate verification —
        # confirm that is intentional before relying on this client.
        self.client = httpx.Client(
            headers=self.headers, verify=False, timeout=30.0)

    def _local_call(self, *args, **kwargs):
        """Fetch the urls and return {'result': [{'url', 'content'}, ...]}.

        Returns {'result': ''} when no urls were supplied, matching the
        original short-circuit shape.
        """
        parsed_args, parsed_kwargs = self._local_parse_input(*args, **kwargs)
        urls = parsed_kwargs['urls']
        # _local_parse_input always yields a list, so a None check was dead;
        # `not urls` also short-circuits the empty-list case instead of
        # invoking the loader with nothing to fetch.
        if not urls:
            return {'result': ''}
        # Load the raw HTML for every url.
        loader = AsyncHtmlLoader(urls)
        docs = loader.load()
        # Reduce each page to the text of its <span> elements.
        bs_transformer = BeautifulSoupTransformer()
        docs_transformed = bs_transformer.transform_documents(
            docs, tags_to_extract=['span'])
        # Optionally split page content into chunks for fine-grained results.
        if self.split_url_into_chunk:
            splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=1000, chunk_overlap=0)
            splits = splitter.split_documents(docs_transformed)
        else:
            splits = docs_transformed
        return {
            'result': [{
                'url': item.metadata['source'],
                'content': item.page_content
            } for item in splits]
        }

    def _local_parse_input(self, *args, **kwargs):
        """Normalize the 'urls' kwarg to a list (a bare string becomes [str])."""
        urls = kwargs.get('urls', [])
        if isinstance(urls, str):
            urls = [urls]
        kwargs['urls'] = urls
        return args, kwargs
if __name__ == '__main__':
    # Ad-hoc manual check: fetch a single blog page and dump the raw result.
    browser = WebBrowser()
    target_urls = ['https://blog.sina.com.cn/zhangwuchang']
    print(browser._local_call(urls=target_urls))