# News fact-checking helpers: keyword extraction via an LLM, Baidu Baike
# profile lookup, and SerpAPI news search (Bing / Baidu / Google News).
import urllib.request
import urllib.parse
from langchain_core.output_parsers import StrOutputParser
import requests
import html2text
import urllib.request
from langchain_openai import ChatOpenAI
import os
from langchain_core.prompts import ChatPromptTemplate
from serpapi import GoogleSearch
# WARNING(security): live API secrets are hard-coded below. They are exposed
# to anyone who can read this file and should be rotated immediately and
# loaded from the environment / a secrets store instead of being set here.
os.environ["OPENAI_API_KEY"] = "sb-6a683cb3bd63a9b72040aa2dd08feff8b68f08a0e1d959f5"
os.environ['OPENAI_BASE_URL'] = "https://api.openai-sb.com/v1/"
os.environ["SERPAPI_API_KEY"] = "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404"
# Shared chat model for every chain in this module.
llm=ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.6)
# Prompt (Chinese by design — the claims are Chinese news): extract up to
# three fact-critical noun keywords from the claim, one per sub-clause,
# returned in the literal form "[kw1,kw2,kw3]".
ext_template="""
提供的Claim可能由多个子句构成。请从每个子句中选出最能决定新闻真实性的主语\
仅提取那些对确认新闻真实性至关重要的词条,无关紧要的子句可以忽略\
提取的词条要是名词\
总共提取的关键词条数量不超过三个\
以下面的格式返回提取到的词条\
格式:
[词条1,词条2,词条3]
Claim:{question}
"""
ext_prompt = ChatPromptTemplate.from_template(ext_template)
# Prompt (Chinese by design): merge the gathered background material,
# de-duplicate it, and return one clean readable text.
syn_template="""
给定的Claim是一个新闻的相关资料,你的任务是综合Claim的内容,删除重复部分,确保最终文本清晰明了、易于阅读\
返回处理后的文本
Claim:{information}
"""
syn_prompt=ChatPromptTemplate.from_template(syn_template)
# Chain used by baidu() to synthesize the per-keyword summaries.
syn_chain=syn_prompt | llm | StrOutputParser()
def extract(question):
    """Run the keyword-extraction prompt on *question*.

    Returns the model's raw string reply, expected (but not guaranteed)
    to look like "[kw1,kw2,kw3]"; callers do their own parsing.
    """
    chain = ext_prompt | llm | StrOutputParser()
    return chain.invoke({"question": question})
def get_linktext(url):
    """Best-effort fetch of *url*, returning ``(flag, text)``.

    ``flag`` is True when any non-empty text was retrieved; ``text`` is the
    page content converted to readable plain text via html2text (or the raw
    HTML if conversion fails). Tries ``requests`` first, then falls back to
    ``urllib``. All fetch failures are deliberately swallowed — callers only
    care whether usable text came back — so on total failure the result is
    ``(False, '')``.
    """
    html_content = ''
    # Primary fetch: requests handles redirects and charset detection.
    try:
        # A timeout keeps a dead host from hanging the whole pipeline
        # (the original call could block forever).
        response = requests.get(url, timeout=10)
        html_content = response.text
    except Exception:  # narrow "best effort": never let a fetch error escape
        pass
    # Fallback fetch via urllib when requests produced nothing.
    if not html_content:
        try:
            response = urllib.request.urlopen(url, timeout=10)
            html_content = response.read().decode('utf-8')
        except Exception:  # includes non-UTF-8 bodies; treated as "no content"
            pass
    # Convert HTML to plain text; on failure keep whatever we fetched.
    if html_content:
        try:
            html_content = html2text.html2text(html_content)
        except Exception:
            pass
    html_content = html_content.strip()
    return bool(html_content), html_content
# Prompt (Chinese by design): given a page's text, pull out the profile /
# background / latest-status information about one item, as fully as possible.
html_template="""
给定的text是一个html页面的有关内容,你需要提取对item的有关简介、履历,尽量全面。特别是关于item的最新情况\
综合返回提取的内容
text:{text} \n item: {item}
"""
html_prompt = ChatPromptTemplate.from_template(html_template)
# Chain used by baidu() to summarize each fetched Baike page.
html_chain=html_prompt | llm | StrOutputParser()
def baidu(question):
    """Gather Baidu Baike background for the claim in *question*.

    Extracts up to three keywords from the claim, fetches each keyword's
    Baike page, has the LLM distill the relevant profile information, and
    finally synthesizes everything into one de-duplicated text, which is
    returned as a string.
    """
    information = ""
    result = extract(question)
    # The model answers in the form "[a,b,c]" but sometimes uses the
    # full-width Chinese comma — normalize it before splitting, and drop
    # empty entries so a stray separator cannot produce a bogus lookup.
    res = result.strip().strip('[]').replace(',', ',').replace(' ', '')
    keywords = [kw for kw in res.split(',') if kw]
    for keyword in keywords:
        # Percent-encode the (usually Chinese) term: the urllib fallback in
        # get_linktext rejects raw non-ASCII characters in a URL.
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(keyword)
        flag, text = get_linktext(url)
        if flag:
            # Truncate the page text to stay within the model's context window.
            result = html_chain.invoke({"text": text[0:6000], "item": keyword})
            information = information + keyword + ":" + result + '\n'
    # Merge / de-duplicate the per-keyword summaries into one clean text.
    information = syn_chain.invoke({"information": information})
    return information
def search_bing(query):
    """Search Bing News (via SerpAPI) for *query*.

    Returns a blank-line-separated digest with Title/Snippet/Date/Source
    for every organic result, or "" when there are none.
    """
    params_bing = {
        "engine": "bing_news",
        "q": query,
        "cc": "us",
        # Read the key from the environment (set once at module import)
        # instead of duplicating the hard-coded secret in every helper.
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_bing)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    # .get() with fallbacks: SerpAPI entries do not always carry every field.
    final_output = "\n\n".join([
        f"Title: {news.get('title', 'No Title')}\nSnippet: {news.get('snippet', 'No Snippet')}\nDate: {news.get('date', 'No Date')}\nSource:{news.get('source','No Source')}"
        for news in organic_results
    ])
    return final_output
def search_baidu(query):
    """Search Baidu News (via SerpAPI) for *query*.

    Returns a blank-line-separated digest with Title/Snippet/Date/Source
    for every organic result, or "" when there are none.
    """
    params_baidu = {
        "engine": "baidu_news",
        "q": query,
        "ct": "1",
        # Read the key from the environment (set once at module import)
        # instead of duplicating the hard-coded secret in every helper.
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_baidu)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    # .get() with fallbacks: SerpAPI entries do not always carry every field.
    final_output = "\n\n".join([
        f"Title: {news.get('title', 'No Title')}\nSnippet: {news.get('snippet', 'No Snippet')}\nDate: {news.get('date', 'No Date')}\nSource:{news.get('source','No Source')}"
        for news in organic_results
    ])
    return final_output
def search_google(query):
    """Search Google News (via SerpAPI) for *query*.

    Returns a blank-line-separated digest with Title/Snippet/Date/Source
    for every organic result, or "" when there are none.
    """
    params_google = {
        "engine": "google_news",
        "q": query,
        # Read the key from the environment (set once at module import)
        # instead of duplicating the hard-coded secret in every helper.
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_google)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    # .get() with fallbacks: SerpAPI entries do not always carry every field.
    final_output = "\n\n".join([
        f"Title: {news.get('title', 'No Title')}\nSnippet: {news.get('snippet', 'No Snippet')}\nDate: {news.get('date', 'No Date')}\nSource:{news.get('source','No Source')}"
        for news in organic_results
    ])
    return final_output