File size: 2,968 Bytes
e350168
 
 
 
 
fdffdf0
e350168
 
 
 
8ba144e
e350168
 
 
 
 
8ba144e
02d932f
8ba144e
e350168
 
 
 
8ba144e
e350168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdffdf0
e350168
 
 
 
 
 
 
 
 
 
 
 
02d932f
 
e350168
02d932f
 
 
 
 
 
 
 
 
fdffdf0
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import json
import jieba
import re
import requests
import backoff
import time


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def post_url(url, headers, payload, timeout=30):
    """POST ``payload`` to ``url`` and return the ``requests`` response.

    Any ``requests.exceptions.RequestException`` raised by the call is
    retried through ``backoff`` using the ``backoff.expo`` wait strategy.
    No ``max_tries``/``max_time`` is configured on the decorator, so the
    retry loop is unbounded.

    Args:
        url: Target URL.
        headers: Mapping of HTTP headers to send.
        payload: Request body, passed to ``requests`` as ``data``.
        timeout: Seconds to wait for the server before giving up on one
            attempt. Without a timeout ``requests`` may block forever on a
            stalled connection, and the retry logic would never get a
            chance to run. A timeout surfaces as
            ``requests.exceptions.Timeout`` (a ``RequestException``), so it
            is retried like any other request failure.

    Returns:
        The ``requests.Response`` object; the HTTP status is not checked.
    """
    # Fixed one-second pause before every attempt (first try and retries).
    time.sleep(1)
    return requests.request(
        "POST", url, headers=headers, data=payload, timeout=timeout)


def seg(text):
    """Split ``text`` into a list of non-empty sentence-like fragments.

    Newlines are replaced by single spaces first. The text is then cut
    right after every delimiter character listed in the pattern (sentence
    punctuation, the colon and the space character), and any whitespace
    following the delimiter is consumed by the cut. Because the space
    itself is a delimiter, a plain space also ends a fragment and stays
    attached to the fragment before it. Empty fragments are discarded.
    """
    flattened = " ".join(text.split('\n'))
    pieces = re.split(r'(?<=[。!?.!?: ])\s*', flattened)
    # filter(None, ...) drops exactly the empty strings produced by the split.
    return list(filter(None, pieces))


# Absolute http/https/ftp URLs and bare "www." hosts. The tail is limited to
# ASCII URL characters so that adjacent non-ASCII text (e.g. Chinese written
# without spaces around the link) is not swallowed together with the URL.
_URL_RE = re.compile(
    r"(?:(?:https?|ftp)://|www\.)[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+",
    re.IGNORECASE,
)
_DATE_RE = re.compile(r"\d+/\d+/\d+")  # e.g. 12/05/2023
_TIME_RE = re.compile(r"[0-2]?[0-9]:[0-6][0-9]")  # e.g. 9:30, 23:59


def clean_text(text):
    """Reduce ``text`` to space-separated alphabetic words.

    Steps, in order:
      1. Remove URLs. This runs first, while the URL is still intact:
         the later newline/hyphen rewriting would otherwise glue
         following text onto it or break it apart.
      2. Drop newlines (no space is inserted) and turn hyphens into spaces.
      3. Remove ``d/d/d`` dates and ``h:mm`` times.
      4. Keep only characters for which ``str.isalpha()`` is true, plus
         spaces.
      5. Discard words of length 1 and join the rest with single spaces.

    The previous URL pattern was a JavaScript regex literal (``/.../i``)
    used verbatim in Python, so the delimiters were matched as literal
    characters and real URLs were never stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    text = _URL_RE.sub("", text)
    text = text.replace('\n', "").replace("-", " ")
    text = _DATE_RE.sub("", text)
    text = _TIME_RE.sub("", text)

    # Single pass + join instead of repeated string concatenation.
    letters_only = "".join(ch for ch in text if ch.isalpha() or ch == ' ')

    return ' '.join(word for word in letters_only.split() if len(word) > 1)


def article_to_group(groups, topics):
    """Concatenate the pieces of ``groups`` that share the same label.

    Each element of ``groups`` is indexed as ``(piece, label)``: index 0 is
    the content and index 1 is the key it is filed under. Pieces with the
    same label are joined with ``+`` in the order they appear.

    Args:
        groups: Iterable of indexable ``(piece, label)`` entries.
        topics: Not used by this function; kept for interface compatibility.

    Returns:
        Dict mapping each label to the concatenation of its pieces, with
        labels in first-seen order.
    """
    merged = {}
    for entry in groups:
        piece, label = entry[0], entry[1]
        if label in merged:
            merged[label] = merged[label] + piece
            continue
        merged[label] = piece
    return merged


def generation(para, max_length):
    """Return the title mapping and ordered title list for grouped text.

    Remote summarisation is currently disabled: the POST to the
    summary service was commented out, so every group key serves as its
    own title. This function therefore performs no network access.
    Previously it still requested an OAuth access token (using API
    credentials hard-coded in the source) on every call even though the
    token was never used, which made the function slow and caused it to
    fail when offline.

    NOTE(review): the credentials that used to be embedded here were
    committed in plain text and should be treated as compromised (rotate
    them). If summarisation is re-enabled, load them from the environment
    or a secrets store instead of the source file.

    Args:
        para: Mapping of group key -> group text.
        max_length: Intended maximum summary length. Unused while
            summarisation is disabled; kept for interface compatibility.

    Returns:
        A ``(topic, ai_abstract)`` tuple where ``topic`` maps each key to
        ``(key, text)`` and ``ai_abstract`` lists the keys in the iteration
        order of ``para``.
    """
    pairs = list(para.items())
    topic = {title: (title, body) for title, body in pairs}
    ai_abstract = [title for title, _ in pairs]
    return topic, ai_abstract
def formate_text(title_dict, outline_list):
    """Render an outline as a flat list of Markdown-style lines.

    For every entry of ``outline_list``:
      * if it is not a key of ``title_dict`` it is emitted as a level-1
        heading (``# entry``);
      * otherwise it is emitted as a level-2 heading (``## entry``)
        followed by element 1 of ``title_dict[entry]``.

    Args:
        title_dict: Mapping of title -> indexable value whose element 1 is
            the text placed under the heading.
        outline_list: Ordered iterable of outline entries.

    Returns:
        List of headings and body items in outline order.
    """
    lines = []
    for heading in outline_list:
        if heading in title_dict:
            lines.extend((f"## {heading}", title_dict[heading][1]))
        else:
            lines.append(f"# {heading}")
    return lines