TSA / util.py
QINGCHE's picture
add outline and BERTinference
fdffdf0
raw
history blame
2.61 kB
import json
import os
import re
import time

import backoff
import jieba
import requests
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def post_url(url, headers, payload, timeout=30):
    """POST *payload* to *url* and return the ``requests`` response.

    Any ``requests.exceptions.RequestException`` triggers a retry with
    exponential backoff via the decorator. No ``max_tries`` is configured
    there, so a persistently failing endpoint is retried indefinitely.

    Args:
        url: Target URL.
        headers: HTTP headers to send.
        payload: Request body, passed through as ``data``.
        timeout: Seconds to wait for the server before giving up on one
            attempt. Without it a stalled connection would block forever and
            the retry logic would never get a chance to run.

    Returns:
        The ``requests.Response`` object; the status code is not checked here.
    """
    # Short pause before every attempt (retries included) -- presumably to
    # stay under the remote API's rate limit.
    time.sleep(0.3)
    return requests.request(
        "POST", url, headers=headers, data=payload, timeout=timeout)
def seg(text):
    """Split *text* into sentences at sentence-ending punctuation.

    A boundary is placed right after each ``。``, ``!`` or ``?``; whitespace
    following the mark is consumed. The punctuation stays attached to its
    sentence. Text that ends with one of these marks therefore produces a
    trailing empty string in the result.
    """
    boundary = re.compile(r'(?<=[。!?])\s*')
    return boundary.split(text)
def clean_text(text):
    """Strip URLs, dates, times and non-letter characters from *text*.

    Processing steps, in order:
      1. newlines become spaces;
      2. URLs (``http://``, ``https://``, ``ftp://`` or ``www.`` prefixed)
         are removed;
      3. hyphens become spaces;
      4. ``d/d/d`` style dates and ``hh:mm`` style times are removed;
      5. every character that is neither alphabetic nor a space is dropped;
      6. tokens of length 1 are discarded and the rest joined by one space.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.replace('\n', " ")
    # URLs must go before the hyphen/date substitutions below, otherwise
    # those rewrite parts of the URL and it no longer matches as one unit.
    # The previous pattern was a JavaScript regex literal ("/.../i") whose
    # delimiters were matched literally, so it never removed anything.
    text = re.sub(
        r"(?:(?:https?|ftp)://|www\.)[a-z0-9\-._~:/?#@%&=+]+",
        "", text, flags=re.IGNORECASE)  # URLs
    text = text.replace("-", " ")
    text = re.sub(r"\d+/\d+/\d+", "", text)  # dates
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)  # times
    # Keep letters (any script) and spaces only.
    letters_only = "".join(
        ch for ch in text if ch.isalpha() or ch == " ")
    return " ".join(word for word in letters_only.split() if len(word) > 1)
def article_to_group(groups, topics):
    """Merge the pieces in *groups* that share the same label.

    Args:
        groups: Iterable of indexable items where ``item[0]`` is the content
            and ``item[1]`` is the label it belongs to.
        topics: Unused; kept for interface compatibility.

    Returns:
        A dict mapping each label to the ``+``-concatenation of its contents,
        in order of appearance.
    """
    merged = {}
    for item in groups:
        content, label = item[0], item[1]
        if label in merged:
            merged[label] = merged[label] + content
        else:
            merged[label] = content
    return merged
def generation(para, max_length):
    """Summarize every text group through Baidu's ``news_summary`` endpoint.

    Args:
        para: Mapping of group key -> text to summarize.
        max_length: Value sent as ``max_summary_len`` with each request.

    Returns:
        A ``(topic, Ai_abstract)`` tuple. ``topic`` maps each returned summary
        to its ``(group key, original text)`` pair; ``Ai_abstract`` lists the
        summaries in the iteration order of ``para``. If two groups yield the
        same summary, the later one overwrites the earlier entry in ``topic``.

    Raises:
        KeyError: If the token response lacks ``access_token`` or a summary
            response lacks ``summary``.
    """
    # Nothing to summarize: avoid the token round-trip altogether.
    if not para:
        return {}, []

    # SECURITY NOTE(review): these credentials were committed in source.
    # Environment variables now take precedence; the embedded fallbacks only
    # keep existing deployments working and should be rotated and removed.
    API_KEY = os.environ.get("BAIDU_API_KEY", "IZt1uK9PAI0LiqleqT0cE30b")
    SECRET_KEY = os.environ.get(
        "BAIDU_SECRET_KEY", "Xv5kHB8eyhNuI1B1G7fRgm2SIPdlxGxs")

    def get_access_token():
        """Exchange the API key/secret for an OAuth access token."""
        token_url = "https://aip.baidubce.com/oauth/2.0/token"
        params = {"grant_type": "client_credentials",
                  "client_id": API_KEY, "client_secret": SECRET_KEY}
        token = requests.post(
            token_url, params=params, timeout=30).json().get("access_token")
        if token is None:
            # Previously this became the literal string "None" in the URL and
            # only failed later with an unrelated-looking error.
            raise KeyError("access_token missing from OAuth token response")
        return str(token)

    url = "https://aip.baidubce.com/rpc/2.0/nlp/v1/news_summary?charset=UTF-8&access_token=" + get_access_token()
    # Headers are identical for every request, so build them once.
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    topic = {}
    Ai_abstract = []
    for group_key, content in para.items():
        payload = json.dumps({
            "content": content,
            "max_summary_len": max_length
        })
        response = post_url(url, headers, payload)
        text_dict = json.loads(response.text)
        if "summary" not in text_dict:
            # Surface the service's reply instead of a bare KeyError.
            raise KeyError(
                f"'summary' missing from news_summary response: {text_dict!r}")
        summary = text_dict['summary']
        topic[summary] = (group_key, content)
        Ai_abstract.append(summary)
    return topic, Ai_abstract
def formate_text(title_dict, outline_list):
    """Render an outline as a flat list of Markdown-style lines.

    Args:
        title_dict: Mapping of section title -> indexable value whose element
            at index 1 is the section's body text.
        outline_list: Ordered outline entries.

    Returns:
        A list of strings. Entries absent from ``title_dict`` become
        ``"# <entry>"``; entries present become ``"## <entry>"`` followed by
        their body text.
    """
    lines = []
    for entry in outline_list:
        if entry in title_dict:
            lines.append(f"## {entry}")
            lines.append(title_dict[entry][1])
        else:
            lines.append(f"# {entry}")
    return lines