|
import json |
|
import jieba |
|
import re |
|
import requests |
|
import backoff |
|
import time |
|
|
|
|
|
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def post_url(url, headers, payload, timeout=60):
    """POST *payload* to *url* and return the response object.

    The call is retried with exponential backoff whenever ``requests``
    raises a ``RequestException`` (connection errors, timeouts, ...).
    The decorator sets no retry limit.

    Args:
        url: Target URL.
        headers: HTTP headers to send with the request.
        payload: Request body, passed to ``requests`` as ``data``.
        timeout: Seconds to wait for the server before giving up on one
            attempt. Without it a stalled connection would block forever
            and the backoff retry would never get a chance to run.

    Returns:
        The ``requests.Response``. The HTTP status code is not checked here.
    """
    # One-second pause before every attempt, retries included
    # (presumably to stay under the remote API's rate limit).
    time.sleep(1)
    return requests.post(url, headers=headers, data=payload, timeout=timeout)
|
|
|
|
|
def seg(text):
    """Split *text* into a list of sentences.

    Newlines are first turned into spaces. The text is then cut right after
    every character of the set used in the pattern below, and whitespace
    that directly follows such a character is consumed by the cut. Empty
    pieces are dropped.

    Note that ``.`` is part of the set, so a string such as ``3.14`` is cut
    into ``3.`` and ``14``.
    """
    flattened = text.replace('\n', " ")
    pieces = re.split(r'(?<=[。!?.!?:])\s*', flattened)
    # A cut at the very end of the text leaves a trailing empty string.
    return [piece for piece in pieces if piece]
|
|
|
|
|
def clean_text(text):
    """Strip noise from *text* and return space-separated words.

    Processing steps, in order:

    1. Newlines are deleted (lines are joined without a separator).
    2. URLs starting with ``http://``, ``https://``, ``ftp://`` or ``www.``
       are removed.
    3. Hyphens become spaces.
    4. ``d/d/d`` style dates and ``hh:mm`` style times are removed.
    5. Every character that is neither alphabetic nor a space is dropped.
    6. Words of a single character are discarded.

    Args:
        text: Raw input string.

    Returns:
        The cleaned words joined by single spaces.
    """
    text = text.replace('\n', "")
    # URLs are removed first, while they are still intact: the hyphen
    # replacement below would otherwise break a hyphenated URL in two.
    # The previous pattern was a JavaScript regex literal ("/.../i"); in
    # Python its delimiters matched literally, so real URLs were kept.
    # The character class is ASCII-only so that adjacent non-ASCII text
    # (e.g. Chinese) written without a space is not swallowed.
    text = re.sub(
        r"(?:https?://|ftp://|www\.)[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+",
        "", text, flags=re.IGNORECASE)
    text = text.replace("-", " ")
    text = re.sub(r"\d+/\d+/\d+", "", text)
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
    pure_text = ''.join(ch for ch in text if ch.isalpha() or ch == ' ')
    return ' '.join(word for word in pure_text.split() if len(word) > 1)
|
|
|
|
|
def article_to_group(groups, topics):
    """Merge the content of all entries that share the same group key.

    Args:
        groups: Iterable of indexable entries; element ``[0]`` is the
            content and element ``[1]`` is the group key.
        topics: Not used by this function.

    Returns:
        A dict mapping each group key to the ``+``-concatenation of the
        contents carrying that key, in input order.
    """
    merged = {}
    for entry in groups:
        key = entry[1]
        content = entry[0]
        if key in merged:
            # Plain ``+`` (not ``+=``) so the stored object is never
            # modified in place.
            merged[key] = merged[key] + content
        else:
            merged[key] = content
    return merged
|
|
|
|
|
def generation(para, max_length, max_retries=10):
    """Summarise every entry of *para* with Baidu's news-summary endpoint.

    Args:
        para: Mapping of key -> text. Each text is sent as ``content``.
        max_length: Value sent as ``max_summary_len``.
        max_retries: How many times one entry is re-sent when the reply
            carries no ``summary`` field, before giving up.

    Returns:
        A tuple ``(topic, Ai_abstract)``. ``topic`` maps each summary to
        its ``(key, text)`` pair; ``Ai_abstract`` lists the summaries in
        the iteration order of *para*.

    Raises:
        RuntimeError: If no access token can be obtained, or if an entry
            still has no summary after ``max_retries`` retries.
    """
    import os  # local import keeps this block self-contained

    # NOTE(security): credentials should not live in source control.
    # Environment variables take precedence; the literals remain only as a
    # backward-compatible fallback and should be rotated and removed.
    API_KEY = os.environ.get("BAIDU_API_KEY", "IZt1uK9PAI0LiqleqT0cE30b")
    SECRET_KEY = os.environ.get(
        "BAIDU_SECRET_KEY", "Xv5kHB8eyhNuI1B1G7fRgm2SIPdlxGxs")

    def get_access_token():
        """Exchange the API key pair for an OAuth access token."""
        token_url = "https://aip.baidubce.com/oauth/2.0/token"
        params = {"grant_type": "client_credentials",
                  "client_id": API_KEY, "client_secret": SECRET_KEY}
        reply = requests.post(token_url, params=params, timeout=60).json()
        token = reply.get("access_token")
        if not token:
            # Previously a missing token became the string "None", which
            # made every later summary request fail.
            raise RuntimeError(
                f"Baidu OAuth token request failed: {reply}")
        return str(token)

    url = "https://aip.baidubce.com/rpc/2.0/nlp/v1/news_summary?charset=UTF-8&access_token=" + get_access_token()
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    attempts = max(max_retries, 0) + 1

    topic = {}
    Ai_abstract = []
    for key, content in para.items():
        payload = json.dumps({
            "content": content,
            "max_summary_len": max_length
        })

        text_dict = None
        for attempt in range(attempts):
            if attempt:
                print("ReTrying")
            response = post_url(url, headers, payload)
            text_dict = json.loads(response.text)
            if 'summary' in text_dict:
                break
        else:
            # Bounded instead of looping forever on a permanent API error.
            raise RuntimeError(
                f"No summary returned for {key!r} after {attempts} "
                f"attempts; last reply: {text_dict}")

        summary = text_dict['summary']
        topic[summary] = (key, content)
        Ai_abstract.append(summary)
    return topic, Ai_abstract
|
def formate_text(title_dict, outline_list):
    """Render an outline as a flat list of Markdown-style lines.

    Args:
        title_dict: Mapping of heading -> indexable value whose element
            ``[1]`` is the text placed under that heading.
        outline_list: Headings in output order.

    Returns:
        A list where a heading missing from *title_dict* becomes
        ``"# heading"``, and a heading present in it becomes
        ``"## heading"`` followed by its text.
    """
    lines = []
    for heading in outline_list:
        if heading in title_dict.keys():
            lines.append(f"## {heading}")
            lines.append(title_dict[heading][1])
        else:
            lines.append(f"# {heading}")
    return lines