import os
import urllib.error
import urllib.parse
import urllib.request

import html2text
import requests
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from serpapi import GoogleSearch

# Credentials for the OpenAI-compatible endpoint and SerpApi.
os.environ["OPENAI_API_KEY"] = "sb-6a683cb3bd63a9b72040aa2dd08feff8b68f08a0e1d959f5"
os.environ["OPENAI_BASE_URL"] = "https://api.openai-sb.com/v1/"
os.environ["SERPAPI_API_KEY"] = "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404"

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.6)
# Prompt for extracting the key terms that decide a claim's veracity.
ext_template = """
The given Claim may consist of several clauses. From each clause, pick out the subject \
that most strongly determines whether the news is true. \
Extract only the terms that are essential for verifying the news; irrelevant clauses can be ignored. \
Extracted terms must be nouns. \
Extract no more than three key terms in total. \
Return the extracted terms in the following format:
Format:
[term1,term2,term3]
Claim:{question}
"""
ext_prompt = ChatPromptTemplate.from_template(ext_template)
# Prompt for merging gathered evidence into one readable summary.
syn_template = """
The given Claim is background material about a news story. Your task is to synthesize \
its content, remove duplicated parts, and make sure the final text is clear and easy to read. \
Return the processed text.
Claim:{information}
"""
syn_prompt = ChatPromptTemplate.from_template(syn_template)
syn_chain = syn_prompt | llm | StrOutputParser()
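# Usage sketch for syn_chain (illustrative only; assumes the credentials above
# are valid). It condenses redundant evidence into one readable paragraph:
#
#   merged = syn_chain.invoke({"information": "NASA: ...\nNASA: ..."})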
def extract(question):
    """Extract up to three verification-critical key terms from a claim."""
    ext_chain = ext_prompt | llm | StrOutputParser()
    return ext_chain.invoke({"question": question})
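# Usage sketch (the claim below is a made-up example, not from the project):
#
#   terms = extract("Musk announced that Tesla will relocate its headquarters")
#   # expected shape: "[Musk,Tesla]"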
def get_linktext(url):
    """Fetch a page and convert its HTML to plain text.

    Returns (flag, text), where flag is True only if non-empty text was
    recovered. Tries requests first, then falls back to urllib.
    """
    flag = False
    html_content = ''
    try:
        response = requests.get(url, timeout=10)
        html_content = response.text
    except requests.RequestException:
        pass
    if not html_content:
        try:
            response = urllib.request.urlopen(url)
            html_content = response.read().decode('utf-8')
        except (urllib.error.URLError, UnicodeDecodeError):
            pass
    try:
        if html_content:
            # Strip tags and markup, keeping only readable text.
            html_content = html2text.html2text(html_content)
    except Exception:
        pass
    html_content = html_content.strip()
    if html_content:
        flag = True
    return flag, html_content
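# Usage sketch (the URL is illustrative):
#
#   ok, text = get_linktext("https://baike.baidu.com/item/NASA")
#   if ok:
#       print(text[:200])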
# Prompt for distilling a fetched page into a profile of one key term.
html_template = """
The given text is the content of an HTML page. You need to extract the profile and \
background information related to item, as comprehensively as possible, especially \
the most recent information about item. \
Return a synthesis of the extracted content.
text:{text} \n item: {item}
"""
html_prompt = ChatPromptTemplate.from_template(html_template)
html_chain = html_prompt | llm | StrOutputParser()
def baidu(question):
    """Look up each extracted key term on Baidu Baike and merge the findings."""
    information = ""
    result = extract(question)
    # The model returns "[term1,term2,term3]"; parse it into a list.
    res = result.strip('[]').replace(' ', '').split(',')
    for keyword in res:
        # Quote the keyword so non-ASCII terms form a valid URL.
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(keyword)
        flag, text = get_linktext(url)
        if flag:
            # Truncate the page text to keep the prompt within context limits.
            result = html_chain.invoke({"text": text[:6000], "item": keyword})
            information = information + keyword + ":" + result + '\n'
    information = syn_chain.invoke({"information": information})
    return information
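# End-to-end sketch of the Baike lookup (illustrative claim):
#
#   background = baidu("Musk announced that Tesla will relocate its headquarters")
#   # returns one merged, deduplicated profile of the extracted key terms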
def _format_news_results(news_items):
    """Render a list of news results as a blank-line-separated block."""
    return "\n\n".join([
        f"Title: {news.get('title', 'No Title')}\n"
        f"Snippet: {news.get('snippet', 'No Snippet')}\n"
        f"Date: {news.get('date', 'No Date')}\n"
        f"Source: {news.get('source', 'No Source')}"
        for news in news_items
    ])

def search_bing(query):
    """Search Bing News (via SerpApi) and return formatted headlines."""
    params_bing = {
        "engine": "bing_news",
        "q": query,
        "cc": "us",
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_bing)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    return _format_news_results(organic_results)

def search_baidu(query):
    """Search Baidu News (via SerpApi) and return formatted headlines."""
    params_baidu = {
        "engine": "baidu_news",
        "q": query,
        "ct": "1",
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_baidu)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    return _format_news_results(organic_results)
def search_google(query):
    """Search Google News (via SerpApi) and return formatted headlines."""
    params_google = {
        "engine": "google_news",
        "q": query,
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_google)
    results = search.get_dict()
    # The google_news engine nests items under "news_results",
    # not "organic_results".
    news_results = results.get("news_results", [])
    if not news_results:
        return ""
    return _format_news_results(news_results)
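
# Minimal end-to-end sketch (illustrative; assumes the API keys above are
# valid and the SerpApi plan covers all three news engines). The claim is
# a made-up example, not from the original project.
if __name__ == "__main__":
    claim = "Musk announced that Tesla will relocate its headquarters"
    print("Baike background:\n", baidu(claim))
    for fn in (search_bing, search_baidu, search_google):
        print(f"\n--- {fn.__name__} ---\n{fn(claim)}")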