File size: 5,153 Bytes
50dfccc
 
ea031ab
50dfccc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f31fb9
 
 
 
 
 
 
 
 
 
50dfccc
 
 
 
 
 
 
 
 
 
 
 
 
 
0666fec
50dfccc
 
 
0666fec
50dfccc
 
 
 
0666fec
50dfccc
 
 
 
6595ab8
50dfccc
 
 
 
 
0666fec
50dfccc
 
 
 
 
6595ab8
 
a097b6c
d1efbd2
a097b6c
 
50dfccc
a097b6c
 
 
 
d1efbd2
a097b6c
50dfccc
a097b6c
 
50dfccc
676fe40
 
50dfccc
929c0af
50dfccc
 
929c0af
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from toolbox import CatchException, report_execption, write_results_to_file
from toolbox import update_ui

def get_meta_information(url, chatbot, history):
    import requests
    import arxiv
    import difflib
    from bs4 import BeautifulSoup
    from toolbox import get_conf
    proxies, = get_conf('proxies')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    }
    # 发送 GET 请求
    response = requests.get(url, proxies=proxies, headers=headers)

    # 解析网页内容
    soup = BeautifulSoup(response.text, "html.parser")

    def string_similar(s1, s2):
        return difflib.SequenceMatcher(None, s1, s2).quick_ratio()

    profile = []
    # 获取所有文章的标题和作者
    for result in soup.select(".gs_ri"):
        title = result.a.text.replace('\n', ' ').replace('  ', ' ')
        author = result.select_one(".gs_a").text
        try:
            citation = result.select_one(".gs_fl > a[href*='cites']").text  # 引用次数是链接中的文本,直接取出来
        except:
            citation = 'cited by 0'
        abstract = result.select_one(".gs_rs").text.strip()  # 摘要在 .gs_rs 中的文本,需要清除首尾空格
        search = arxiv.Search(
            query = title,
            max_results = 1,
            sort_by = arxiv.SortCriterion.Relevance,
        )
        try:
            paper = next(search.results())
            if string_similar(title, paper.title) > 0.90: # same paper
                abstract = paper.summary.replace('\n', ' ')
                is_paper_in_arxiv = True
            else:   # different paper
                abstract = abstract
                is_paper_in_arxiv = False
            paper = next(search.results())
        except:
            abstract = abstract
            is_paper_in_arxiv = False
        print(title)
        print(author)
        print(citation)
        profile.append({
            'title':title,
            'author':author,
            'citation':citation,
            'abstract':abstract,
            'is_paper_in_arxiv':is_paper_in_arxiv,
        })

        chatbot[-1] = [chatbot[-1][0], title + f'\n\n是否在arxiv中(不在arxiv中无法获取完整摘要):{is_paper_in_arxiv}\n\n' + abstract]
        yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
    return profile

@CatchException
def 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    # 基本信息:功能、贡献者
    chatbot.append([
        "函数插件功能?",
        "分析用户提供的谷歌学术(google scholar)搜索页面中,出现的所有文章: binary-husky,插件初始化中..."])
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

    # 尝试导入依赖,如果缺少依赖,则给出安装建议
    try:
        import arxiv
        import math
        from bs4 import BeautifulSoup
    except:
        report_execption(chatbot, history, 
            a = f"解析项目: {txt}", 
            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade beautifulsoup4 arxiv```。")
        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
        return

    # 清空历史,以免输入溢出
    history = []
    meta_paper_info_list = yield from get_meta_information(txt, chatbot, history)
    batchsize = 5
    for batch in range(math.ceil(len(meta_paper_info_list)/batchsize)):
        if len(meta_paper_info_list[:batchsize]) > 0:
            i_say = "下面是一些学术文献的数据,提取出以下内容:" + \
            "1、英文题目;2、中文题目翻译;3、作者;4、arxiv公开(is_paper_in_arxiv);4、引用数量(cite);5、中文摘要翻译。" + \
            f"以下是信息源:{str(meta_paper_info_list[:batchsize])}" 

            inputs_show_user = f"请分析此页面中出现的所有文章:{txt},这是第{batch+1}批"
            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs=i_say, inputs_show_user=inputs_show_user,
                llm_kwargs=llm_kwargs, chatbot=chatbot, history=[],
                sys_prompt="你是一个学术翻译,请从数据中提取信息。你必须使用Markdown表格。你必须逐个文献进行处理。"
            )

            history.extend([ f"第{batch+1}批", gpt_say ])
            meta_paper_info_list = meta_paper_info_list[batchsize:]

    chatbot.append(["状态?", 
        "已经全部完成,您可以试试让AI写一个Related Works,例如您可以继续输入Write a \"Related Works\" section about \"你搜索的研究领域\" for me."])
    msg = '正常'
    yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面
    res = write_results_to_file(history)
    chatbot.append(("完成了吗?", res)); 
    yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面