# NOTE(review): the three lines below were page-scrape artifacts
# ("Spaces:" / "Sleeping" / "Sleeping" — HuggingFace Spaces page chrome),
# not code; kept here as comments so the file parses.
# Spaces:
# Sleeping
# Sleeping
| import urllib.request | |
| import urllib.parse | |
| from langchain_core.output_parsers import StrOutputParser | |
| import requests | |
| import html2text | |
| import urllib.request | |
| from langchain_openai import ChatOpenAI | |
| import os | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from serpapi import GoogleSearch | |
# WARNING(security): live-looking API keys are hard-coded in source. They should
# be supplied via the environment / a secrets manager, and these keys rotated,
# since anyone with repo access can use them.
os.environ["OPENAI_API_KEY"] = "sb-6a683cb3bd63a9b72040aa2dd08feff8b68f08a0e1d959f5"
# Requests are routed through a third-party OpenAI-compatible proxy endpoint.
os.environ['OPENAI_BASE_URL'] = "https://api.openai-sb.com/v1/"
os.environ["SERPAPI_API_KEY"] = "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404"
# Single shared chat model instance used by every chain defined in this module.
llm=ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.6)
# Keyword-extraction prompt (Chinese). Asks the model to pick at most three
# noun terms from a claim that are decisive for verifying the news, returned
# as "[term1,term2,term3]". The template text is consumed verbatim by the LLM,
# so it must not be edited/translated here.
ext_template="""
提供的Claim可能由多个子句构成。请从每个子句中选出最能决定新闻真实性的主语\
仅提取那些对确认新闻真实性至关重要的词条,无关紧要的子句可以忽略\
提取的词条要是名词\
总共提取的关键词条数量不超过三个\
以下面的格式返回提取到的词条\
格式:
[词条1,词条2,词条3]
Claim:{question}
"""
ext_prompt = ChatPromptTemplate.from_template(ext_template)
# Synthesis prompt (Chinese): merge the gathered background material, drop
# duplicated parts, and return a clean readable text. Runtime string — keep
# byte-identical.
syn_template="""
给定的Claim是一个新闻的相关资料,你的任务是综合Claim的内容,删除重复部分,确保最终文本清晰明了、易于阅读\
返回处理后的文本
Claim:{information}
"""
syn_prompt=ChatPromptTemplate.from_template(syn_template)
# Built once at module level (unlike ext_chain, which extract() rebuilds per call).
syn_chain=syn_prompt | llm | StrOutputParser()
def extract(question):
    """Extract key terms from a claim.

    Runs the keyword-extraction prompt over *question* and returns the raw
    LLM output, expected to look like "[term1,term2,term3]".
    """
    pipeline = ext_prompt | llm | StrOutputParser()
    return pipeline.invoke({"question": question})
def get_linktext(url):
    """Best-effort fetch of *url*, converted from HTML to plain text.

    Tries ``requests`` first and falls back to ``urllib``; the HTML is then
    rendered to text with ``html2text``.

    Returns:
        (flag, text): flag is True iff non-empty text was obtained; text is
        the stripped extracted content ('' on total failure). Never raises.
    """
    html_content = ''
    try:
        # Timeout added so a dead host cannot hang the whole pipeline;
        # raise_for_status rejects error pages (e.g. a 404 body) that the
        # original code would have treated as real content.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        html_content = response.text
    except Exception:
        pass  # deliberate best-effort: fall through to the urllib fallback
    if not html_content:
        try:
            # Context manager closes the connection (original leaked it);
            # errors='replace' tolerates non-UTF-8 pages instead of failing.
            with urllib.request.urlopen(url, timeout=10) as response:
                html_content = response.read().decode('utf-8', errors='replace')
        except Exception:
            pass
    try:
        if html_content:
            html_content = html2text.html2text(html_content)
    except Exception:
        pass  # keep the raw HTML text if conversion itself fails
    html_content = html_content.strip()
    return bool(html_content), html_content
# Page-summarization prompt (Chinese): given raw page text and an item name,
# extract a comprehensive profile/background of the item, emphasizing the most
# recent information. Runtime string — keep byte-identical.
html_template="""
给定的text是一个html页面的有关内容,你需要提取对item的有关简介、履历,尽量全面。特别是关于item的最新情况\
综合返回提取的内容
text:{text} \n item: {item}
"""
html_prompt = ChatPromptTemplate.from_template(html_template)
html_chain=html_prompt | llm | StrOutputParser()
def baidu(question):
    """Build background information for a claim from Baidu Baike.

    Extracts up to three key terms from *question*, fetches each term's Baike
    page, summarizes every page that was retrieved, and synthesizes the
    per-term summaries into one deduplicated text.

    Returns:
        The synthesized background text, or "" when no page could be fetched.
    """
    information = ""
    raw = extract(question)
    # The LLM answers like "[term1,term2,term3]" — strip brackets and spaces,
    # then split on commas.
    raw = raw.strip('[').strip(']').replace(' ', '')
    for keyword in raw.split(','):
        if not keyword:
            continue  # guard against empty entries from stray commas
        # Percent-encode the (typically Chinese) keyword: the urllib fallback
        # inside get_linktext raises on non-ASCII URLs otherwise.
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(keyword)
        flag, text = get_linktext(url)
        if flag:
            # Cap page text at 6000 chars to stay within the model's context.
            summary = html_chain.invoke({"text": text[0:6000], "item": keyword})
            information = information + keyword + ":" + summary + '\n'
    if not information:
        # Nothing was fetched — skip the synthesis LLM call on empty input.
        return ""
    information = syn_chain.invoke({"information": information})
    return information
def search_bing(query):
    """Search Bing News (US region) for *query* via SerpApi.

    Returns:
        One "Title/Snippet/Date/Source" paragraph per hit, joined by blank
        lines, or "" when there are no organic results.
    """
    params_bing = {
        "engine": "bing_news",
        "q": query,
        "cc": "us",
        # Read the key from the environment (set once at module import)
        # instead of duplicating a hard-coded secret in every search function.
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_bing)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    return "\n\n".join(
        f"Title: {news.get('title', 'No Title')}\n"
        f"Snippet: {news.get('snippet', 'No Snippet')}\n"
        f"Date: {news.get('date', 'No Date')}\n"
        f"Source:{news.get('source','No Source')}"
        for news in organic_results
    )
def search_baidu(query):
    """Search Baidu News for *query* via SerpApi.

    Returns:
        One "Title/Snippet/Date/Source" paragraph per hit, joined by blank
        lines, or "" when there are no organic results.
    """
    params_baidu = {
        "engine": "baidu_news",
        "q": query,
        "ct": "1",
        # Read the key from the environment (set once at module import)
        # instead of duplicating a hard-coded secret in every search function.
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_baidu)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    return "\n\n".join(
        f"Title: {news.get('title', 'No Title')}\n"
        f"Snippet: {news.get('snippet', 'No Snippet')}\n"
        f"Date: {news.get('date', 'No Date')}\n"
        f"Source:{news.get('source','No Source')}"
        for news in organic_results
    )
def search_google(query):
    """Search Google News for *query* via SerpApi.

    Returns:
        One "Title/Snippet/Date/Source" paragraph per hit, joined by blank
        lines, or "" when there are no organic results.
    """
    params_google = {
        "engine": "google_news",
        "q": query,
        # Read the key from the environment (set once at module import)
        # instead of duplicating a hard-coded secret in every search function.
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_google)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    return "\n\n".join(
        f"Title: {news.get('title', 'No Title')}\n"
        f"Snippet: {news.get('snippet', 'No Snippet')}\n"
        f"Date: {news.get('date', 'No Date')}\n"
        f"Source:{news.get('source','No Source')}"
        for news in organic_results
    )