import os
import urllib.error
import urllib.parse
import urllib.request

import html2text
import requests
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from serpapi import GoogleSearch

os.environ["OPENAI_API_KEY"] = "sb-6a683cb3bd63a9b72040aa2dd08feff8b68f08a0e1d959f5"
os.environ['OPENAI_BASE_URL'] = "https://api.openai-sb.com/v1/"
os.environ["SERPAPI_API_KEY"] = "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404"

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.6)

ext_template = """
The given Claim may consist of several clauses. From each clause, pick the subject that best determines whether the news is true.\
Extract only the terms that are essential for verifying the news; irrelevant clauses can be ignored.\
Each extracted term must be a noun.\
Extract no more than three key terms in total.\
Return the extracted terms in the following format.\
Format:
[term1,term2,term3]
Claim:{question}
"""
ext_prompt = ChatPromptTemplate.from_template(ext_template)

syn_template = """
The given Claim is background material about a news item. Your task is to synthesize the content of the Claim, remove duplicated parts, and make sure the final text is clear and easy to read.\
Return the processed text.
Claim:{information}
"""

syn_prompt = ChatPromptTemplate.from_template(syn_template)
syn_chain = syn_prompt | llm | StrOutputParser()

def extract(question):
    """Ask the LLM to pull out up to three key noun terms from the claim."""
    ext_chain = ext_prompt | llm | StrOutputParser()
    result = ext_chain.invoke({"question": question})
    return result

def get_linktext(url):
    """Fetch a URL and convert its HTML to plain text. Returns (success, text)."""
    flag = False
    html_content = ''
    try:
        response = requests.get(url, timeout=10)
        html_content = response.text
    except requests.RequestException:
        pass
    if len(html_content) < 1:
        # Fall back to urllib if the requests-based fetch failed.
        try:
            response = urllib.request.urlopen(url)
            html_content = response.read().decode('utf-8')
        except (urllib.error.URLError, UnicodeDecodeError):
            pass
    try:
        if len(html_content) > 0:
            html_content = html2text.html2text(html_content)
    except Exception:
        pass
    html_content = html_content.strip()
    if len(html_content) > 0:
        flag = True
    return flag, html_content

html_template = """
The given text is the content of an HTML page. You need to extract the profile and background information related to item, as comprehensively as possible, especially the latest news about item.\
Return a synthesis of the extracted content.
text:{text} \n item: {item}
"""
html_prompt = ChatPromptTemplate.from_template(html_template)
html_chain = html_prompt | llm | StrOutputParser()

def baidu(question):
    """Look up each extracted keyword on Baidu Baike and summarize what is found."""
    information = ""
    result = extract(question)
    # The extractor returns terms as "[term1,term2,term3]"; strip the brackets,
    # normalize full-width commas, and split into individual keywords.
    res = result.strip().strip('[]')
    res = res.replace(' ', '').replace('，', ',')
    res = res.split(',')
    for keyword in res:
        # Keywords may contain non-ASCII characters, so URL-encode them.
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(keyword)
        flag, text = get_linktext(url)
        if flag:
            result = html_chain.invoke({"text": text[0:6000], "item": keyword})
            information = information + keyword + ":" + result + '\n'
    information = syn_chain.invoke({"information": information})
    return information

def _format_news(organic_results):
    """Render a list of SerpAPI news results as a readable text block."""
    return "\n\n".join([
        f"Title: {news.get('title', 'No Title')}\n"
        f"Snippet: {news.get('snippet', 'No Snippet')}\n"
        f"Date: {news.get('date', 'No Date')}\n"
        f"Source: {news.get('source', 'No Source')}"
        for news in organic_results
    ])

def search_bing(query):
    """Search Bing News through SerpAPI and return the formatted results."""
    params_bing = {
        "engine": "bing_news",
        "q": query,
        "cc": "us",
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_bing)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    return _format_news(organic_results)

def search_baidu(query):
    """Search Baidu News through SerpAPI and return the formatted results."""
    params_baidu = {
        "engine": "baidu_news",
        "q": query,
        "ct": "1",
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_baidu)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    return _format_news(organic_results)

def search_google(query):
    """Search Google News through SerpAPI and return the formatted results."""
    params_google = {
        "engine": "google_news",
        "q": query,
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_google)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    return _format_news(organic_results)
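
# Minimal usage sketch: a smoke test showing how the pieces compose, assuming
# the OPENAI_API_KEY and SERPAPI_API_KEY values set above are valid. The claim
# below is a hypothetical example, not from the original module.
if __name__ == "__main__":
    claim = "NASA confirmed the presence of water on the Moon in 2020"
    # Extract up to three key noun terms from the claim.
    print("Extracted terms:", extract(claim))
    # Gather and summarize Baidu Baike background for each extracted term.
    print("Baike background:\n", baidu(claim))
    # Pull formatted news snippets from Google News via SerpAPI.
    print("Google News results:\n", search_google(claim))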