import urllib.request import urllib.parse from langchain_core.output_parsers import StrOutputParser import requests import html2text import urllib.request from langchain_openai import ChatOpenAI import os from langchain_core.prompts import ChatPromptTemplate from serpapi import GoogleSearch os.environ["OPENAI_API_KEY"] = "sb-6a683cb3bd63a9b72040aa2dd08feff8b68f08a0e1d959f5" os.environ['OPENAI_BASE_URL'] = "https://api.openai-sb.com/v1/" os.environ["SERPAPI_API_KEY"] = "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404" llm=ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.6) ext_template=""" 提供的Claim可能由多个子句构成。请从每个子句中选出最能决定新闻真实性的主语\ 仅提取那些对确认新闻真实性至关重要的词条,无关紧要的子句可以忽略\ 提取的词条要是名词\ 总共提取的关键词条数量不超过三个\ 以下面的格式返回提取到的词条\ 格式: [词条1,词条2,词条3] Claim:{question} """ ext_prompt = ChatPromptTemplate.from_template(ext_template) syn_template=""" 给定的Claim是一个新闻的相关资料,你的任务是综合Claim的内容,删除重复部分,确保最终文本清晰明了、易于阅读\ 返回处理后的文本 Claim:{information} """ syn_prompt=ChatPromptTemplate.from_template(syn_template) syn_chain=syn_prompt | llm | StrOutputParser() def extract(question): ext_chain = ext_prompt | llm | StrOutputParser() result=ext_chain.invoke({"question": question}) return result def get_linktext(url): flag = False html_content = '' try: response = requests.get(url) html_content = response.text except: pass if len(html_content) < 1: try: response = urllib.request.urlopen(url) html_content = response.read().decode('utf-8') except: pass try: if len(html_content) > 0: html_content = html2text.html2text(html_content) except: pass html_content = html_content.strip() if len(html_content) > 0: flag = True return flag, html_content html_template=""" 给定的text是一个html页面的有关内容,你需要提取对item的有关简介、履历,尽量全面。特别是关于item的最新情况\ 综合返回提取的内容 text:{text} \n item: {item} """ html_prompt = ChatPromptTemplate.from_template(html_template) html_chain=html_prompt | llm | StrOutputParser() def baidu(question): information="" result=extract(question) res = result.strip('[') res = res.strip(']') res = res.replace(' ', '') res = res.split(',') for keyword in res: url='https://baike.baidu.com/item/'+keyword flag,text=get_linktext(url) if(flag): result=html_chain.invoke({"text": text[0:6000],"item":keyword}) information=information+keyword+":"+result+'\n' information=syn_chain.invoke({"information": information}) return information def search_bing(query): params_bing = { "engine": "bing_news", "q": query, "cc": "us", "api_key": "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404" } search = GoogleSearch(params_bing) results = search.get_dict() organic_results = results.get("organic_results", []) if not organic_results: return "" final_output = "\n\n".join([ f"Title: {news.get('title', 'No Title')}\nSnippet: {news.get('snippet', 'No Snippet')}\nDate: {news.get('date', 'No Date')}\nSource:{news.get('source','No Source')}" for news in organic_results ]) return final_output def search_baidu(query): params_baidu = { "engine": "baidu_news", "q": query, "ct": "1", "api_key": "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404" } search = GoogleSearch(params_baidu) results = search.get_dict() organic_results = results.get("organic_results", []) if not organic_results: return "" final_output = "\n\n".join([ f"Title: {news.get('title', 'No Title')}\nSnippet: {news.get('snippet', 'No Snippet')}\nDate: {news.get('date', 'No Date')}\nSource:{news.get('source','No Source')}" for news in organic_results ]) return final_output def search_google(query): params_google = { "engine": "google_news", "q": query, "api_key": "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404" } search = GoogleSearch(params_google) results = search.get_dict() organic_results = results.get("organic_results", []) if not organic_results: return "" final_output = "\n\n".join([ f"Title: {news.get('title', 'No Title')}\nSnippet: {news.get('snippet', 'No Snippet')}\nDate: {news.get('date', 'No Date')}\nSource:{news.get('source','No Source')}" for news in organic_results ]) return final_output