# News fact-checking helpers: keyword extraction via an LLM, Baidu Baike
# profile lookup, and SerpAPI news search (Bing / Baidu / Google News).
import urllib.request
import urllib.parse
from langchain_core.output_parsers import StrOutputParser
import requests
import html2text
import urllib.request
from langchain_openai import ChatOpenAI
import os
from langchain_core.prompts import ChatPromptTemplate
from serpapi import GoogleSearch
# WARNING(security): live API secrets are hard-coded below. They are exposed
# to anyone who can read this file and should be rotated immediately and
# loaded from the environment / a secrets store instead of being set here.
os.environ["OPENAI_API_KEY"] = "sb-6a683cb3bd63a9b72040aa2dd08feff8b68f08a0e1d959f5"
os.environ['OPENAI_BASE_URL'] = "https://api.openai-sb.com/v1/"
os.environ["SERPAPI_API_KEY"] = "dcc98b22d5f7d413979a175ff7d75b721c5992a3ee1e2363020b2bbdf4f82404"
# Shared chat model for every chain in this module.
llm=ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.6)
# Prompt (Chinese by design — the claims are Chinese news): extract up to
# three fact-critical noun keywords from the claim, one per sub-clause,
# returned in the literal form "[kw1,kw2,kw3]".
ext_template="""
提供的Claim可能由多个子句构成。请从每个子句中选出最能决定新闻真实性的主语\
仅提取那些对确认新闻真实性至关重要的词条,无关紧要的子句可以忽略\
提取的词条要是名词\
总共提取的关键词条数量不超过三个\
以下面的格式返回提取到的词条\
格式:
[词条1,词条2,词条3]
Claim:{question}
"""
ext_prompt = ChatPromptTemplate.from_template(ext_template)
# Prompt (Chinese by design): merge the gathered background material,
# de-duplicate it, and return one clean readable text.
syn_template="""
给定的Claim是一个新闻的相关资料,你的任务是综合Claim的内容,删除重复部分,确保最终文本清晰明了、易于阅读\
返回处理后的文本
Claim:{information}
"""
syn_prompt=ChatPromptTemplate.from_template(syn_template)
# Chain used by baidu() to synthesize the per-keyword summaries.
syn_chain=syn_prompt | llm | StrOutputParser()
def extract(question):
    """Run the keyword-extraction prompt on *question*.

    Returns the model's raw string reply, expected (but not guaranteed)
    to look like "[kw1,kw2,kw3]"; callers do their own parsing.
    """
    chain = ext_prompt | llm | StrOutputParser()
    return chain.invoke({"question": question})
def get_linktext(url):
    """Best-effort fetch of *url*, returning ``(flag, text)``.

    ``flag`` is True when any non-empty text was retrieved; ``text`` is the
    page content converted to readable plain text via html2text (or the raw
    HTML if conversion fails). Tries ``requests`` first, then falls back to
    ``urllib``. All fetch failures are deliberately swallowed — callers only
    care whether usable text came back — so on total failure the result is
    ``(False, '')``.
    """
    html_content = ''
    # Primary fetch: requests handles redirects and charset detection.
    try:
        # A timeout keeps a dead host from hanging the whole pipeline
        # (the original call could block forever).
        response = requests.get(url, timeout=10)
        html_content = response.text
    except Exception:  # narrow "best effort": never let a fetch error escape
        pass
    # Fallback fetch via urllib when requests produced nothing.
    if not html_content:
        try:
            response = urllib.request.urlopen(url, timeout=10)
            html_content = response.read().decode('utf-8')
        except Exception:  # includes non-UTF-8 bodies; treated as "no content"
            pass
    # Convert HTML to plain text; on failure keep whatever we fetched.
    if html_content:
        try:
            html_content = html2text.html2text(html_content)
        except Exception:
            pass
    html_content = html_content.strip()
    return bool(html_content), html_content
# Prompt (Chinese by design): given a page's text, pull out the profile /
# background / latest-status information about one item, as fully as possible.
html_template="""
给定的text是一个html页面的有关内容,你需要提取对item的有关简介、履历,尽量全面。特别是关于item的最新情况\
综合返回提取的内容
text:{text} \n item: {item}
"""
html_prompt = ChatPromptTemplate.from_template(html_template)
# Chain used by baidu() to summarize each fetched Baike page.
html_chain=html_prompt | llm | StrOutputParser()
def baidu(question):
    """Gather Baidu Baike background for the claim in *question*.

    Extracts up to three keywords from the claim, fetches each keyword's
    Baike page, has the LLM distill the relevant profile information, and
    finally synthesizes everything into one de-duplicated text, which is
    returned as a string.
    """
    information = ""
    result = extract(question)
    # The model answers in the form "[a,b,c]" but sometimes uses the
    # full-width Chinese comma — normalize it before splitting, and drop
    # empty entries so a stray separator cannot produce a bogus lookup.
    res = result.strip().strip('[]').replace(',', ',').replace(' ', '')
    keywords = [kw for kw in res.split(',') if kw]
    for keyword in keywords:
        # Percent-encode the (usually Chinese) term: the urllib fallback in
        # get_linktext rejects raw non-ASCII characters in a URL.
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(keyword)
        flag, text = get_linktext(url)
        if flag:
            # Truncate the page text to stay within the model's context window.
            result = html_chain.invoke({"text": text[0:6000], "item": keyword})
            information = information + keyword + ":" + result + '\n'
    # Merge / de-duplicate the per-keyword summaries into one clean text.
    information = syn_chain.invoke({"information": information})
    return information
def search_bing(query):
    """Search Bing News (via SerpAPI) for *query*.

    Returns a blank-line-separated digest with Title/Snippet/Date/Source
    for every organic result, or "" when there are none.
    """
    params_bing = {
        "engine": "bing_news",
        "q": query,
        "cc": "us",
        # Read the key from the environment (set once at module import)
        # instead of duplicating the hard-coded secret in every helper.
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_bing)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    # .get() with fallbacks: SerpAPI entries do not always carry every field.
    final_output = "\n\n".join([
        f"Title: {news.get('title', 'No Title')}\nSnippet: {news.get('snippet', 'No Snippet')}\nDate: {news.get('date', 'No Date')}\nSource:{news.get('source','No Source')}"
        for news in organic_results
    ])
    return final_output
def search_baidu(query):
    """Search Baidu News (via SerpAPI) for *query*.

    Returns a blank-line-separated digest with Title/Snippet/Date/Source
    for every organic result, or "" when there are none.
    """
    params_baidu = {
        "engine": "baidu_news",
        "q": query,
        "ct": "1",
        # Read the key from the environment (set once at module import)
        # instead of duplicating the hard-coded secret in every helper.
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_baidu)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    # .get() with fallbacks: SerpAPI entries do not always carry every field.
    final_output = "\n\n".join([
        f"Title: {news.get('title', 'No Title')}\nSnippet: {news.get('snippet', 'No Snippet')}\nDate: {news.get('date', 'No Date')}\nSource:{news.get('source','No Source')}"
        for news in organic_results
    ])
    return final_output
def search_google(query):
    """Search Google News (via SerpAPI) for *query*.

    Returns a blank-line-separated digest with Title/Snippet/Date/Source
    for every organic result, or "" when there are none.
    """
    params_google = {
        "engine": "google_news",
        "q": query,
        # Read the key from the environment (set once at module import)
        # instead of duplicating the hard-coded secret in every helper.
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
    search = GoogleSearch(params_google)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])
    if not organic_results:
        return ""
    # .get() with fallbacks: SerpAPI entries do not always carry every field.
    final_output = "\n\n".join([
        f"Title: {news.get('title', 'No Title')}\nSnippet: {news.get('snippet', 'No Snippet')}\nDate: {news.get('date', 'No Date')}\nSource:{news.get('source','No Source')}"
        for news in organic_results
    ])
    return final_output