# MisDetectorV1 / search.py
import os
import urllib.parse
import urllib.request

import html2text
import requests
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from serpapi import GoogleSearch

# NOTE: credentials are hardcoded here; in practice they should be loaded from
# the environment rather than committed to the repository.
os.environ["OPENAI_API_KEY"] = "sb-6a683cb3bd63a9b72040aa2dd08feff8b68f08a0e1d959f5"
os.environ["OPENAI_BASE_URL"] = "https://api.openai-sb.com/v1/"
os.environ["SERPAPI_API_KEY"] = "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404"

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.6)
ext_template = """
The given Claim may consist of several sub-clauses. From each sub-clause, pick the subject that best determines the veracity of the news. \
Extract only the terms that are essential for verifying the news; irrelevant sub-clauses can be ignored. \
Each extracted term must be a noun. \
Extract no more than three terms in total. \
Return the extracted terms in the format below.
Format:
[term1,term2,term3]
Claim:{question}
"""
ext_prompt = ChatPromptTemplate.from_template(ext_template)
syn_template = """
The given Claim is background material about a news item. Your task is to synthesize the content of the Claim, remove duplicated parts, and make the final text clear and easy to read. \
Return the processed text.
Claim:{information}
"""
syn_prompt = ChatPromptTemplate.from_template(syn_template)
syn_chain = syn_prompt | llm | StrOutputParser()
def extract(question):
    """Extract up to three key noun terms from a claim via the LLM."""
    ext_chain = ext_prompt | llm | StrOutputParser()
    result = ext_chain.invoke({"question": question})
    return result
def get_linktext(url):
    """Fetch a URL and return (success_flag, plain text of the page)."""
    flag = False
    html_content = ''
    # Try requests first; a timeout keeps a dead link from hanging the lookup.
    try:
        response = requests.get(url, timeout=10)
        html_content = response.text
    except Exception:
        pass
    # Fall back to urllib if requests failed.
    if len(html_content) < 1:
        try:
            response = urllib.request.urlopen(url)
            html_content = response.read().decode('utf-8')
        except Exception:
            pass
    # Convert raw HTML to plain text; ignore conversion failures.
    try:
        if len(html_content) > 0:
            html_content = html2text.html2text(html_content)
    except Exception:
        pass
    html_content = html_content.strip()
    if len(html_content) > 0:
        flag = True
    return flag, html_content
html_template = """
The given text is content from an HTML page. You need to extract the profile and background information related to item, as comprehensively as possible, especially the latest information about item. \
Return a synthesis of the extracted content.
text:{text} \n item: {item}
"""
html_prompt = ChatPromptTemplate.from_template(html_template)
html_chain = html_prompt | llm | StrOutputParser()
def baidu(question):
    """Look up each extracted keyword on Baidu Baike and synthesize the results."""
    information = ""
    result = extract(question)
    # The LLM returns "[term1,term2,term3]"; strip the brackets and split.
    res = result.strip('[').strip(']').replace(' ', '').split(',')
    for keyword in res:
        # URL-encode the keyword so non-ASCII terms form a valid URL.
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(keyword)
        flag, text = get_linktext(url)
        if flag:
            summary = html_chain.invoke({"text": text[0:6000], "item": keyword})
            information = information + keyword + ":" + summary + '\n'
    information = syn_chain.invoke({"information": information})
    return information
def _format_news_results(news_items):
    """Format a list of SerpApi news entries into a readable text block."""
    if not news_items:
        return ""
    return "\n\n".join([
        f"Title: {news.get('title', 'No Title')}\n"
        f"Snippet: {news.get('snippet', 'No Snippet')}\n"
        f"Date: {news.get('date', 'No Date')}\n"
        f"Source: {news.get('source', 'No Source')}"
        for news in news_items
    ])

def search_bing(query):
    """Search Bing News (via SerpApi) and return formatted results."""
    params_bing = {
        "engine": "bing_news",
        "q": query,
        "cc": "us",
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_bing)
    results = search.get_dict()
    return _format_news_results(results.get("organic_results", []))

def search_baidu(query):
    """Search Baidu News (via SerpApi) and return formatted results."""
    params_baidu = {
        "engine": "baidu_news",
        "q": query,
        "ct": "1",
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_baidu)
    results = search.get_dict()
    return _format_news_results(results.get("organic_results", []))

def search_google(query):
    """Search Google News (via SerpApi) and return formatted results."""
    params_google = {
        "engine": "google_news",
        "q": query,
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_google)
    results = search.get_dict()
    # SerpApi's google_news engine reports results under "news_results",
    # not "organic_results".
    return _format_news_results(results.get("news_results", []))
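
# A minimal usage sketch (not part of the original module) showing how the
# pieces above could be chained for a single claim. The example claim and the
# choice of evidence sources here are assumptions for illustration only.
if __name__ == "__main__":
    claim = "Example claim about a public figure."  # hypothetical input
    keywords = extract(claim)  # expected form: "[term1,term2,term3]"
    print("Extracted keywords:", keywords)
    background = baidu(claim)   # encyclopedia background for each keyword
    news = search_bing(claim)   # news snippets as additional evidence
    print("Background:\n", background)
    print("News:\n", news)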