File size: 2,084 Bytes
cdc5783
 
 
 
 
7019e7f
090f2d4
da7746d
cdc5783
7019e7f
 
cdc5783
a680719
 
cdc5783
 
 
 
4c74009
2c41d85
858ef45
1f6b7aa
2a78aa3
cdc5783
a680719
 
cdc5783
 
 
 
a680719
cdc5783
 
 
 
30e268a
60a3519
cdc5783
e2ec8e0
cdc5783
 
 
 
 
 
 
 
 
 
 
e2ec8e0
cdc5783
 
 
 
 
 
 
e2ec8e0
cdc5783
 
 
858ef45
cdc5783
 
 
3784e1c
 
7019e7f
e2ec8e0
cdc5783
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import logging

from langdetect import detect
from transformers import pipeline

from utils.tag_utils import filter_tags

# Version marker written to the "ver" key of every result returned by summarize().
AiSummaryVersion = 3
# Minimum classifier score a label must reach to be kept as a tag in get_tags().
MinTagScore = 0.7
# The pipelines below are constructed once, when this module is imported, and
# are shared by every call into the functions of this module.
# Used by get_summarization().
summarization_pipeline = pipeline("summarization", model="Falconsai/text_summarization")
# Used by get_en_translation() for text that is_english() does not recognise as English.
en_translation_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
# Two independent text classifiers; get_tags() concatenates the labels of both.
tag_gen_pipe_1 = pipeline("text-classification", model="dima806/news-category-classifier-distilbert")
tag_gen_pipe_2 = pipeline("text-classification", model="elozano/bert-base-cased-news-category")


def summarize(id: str, text: str):
    """Build the AI summary record for one document.

    Args:
        id: Identifier of the document; echoed back under the ``"id"`` key
            and forwarded to ``get_tags``.
        text: Raw document text.

    Returns:
        A dict. When *text* is ``None`` or shorter than 10 characters the
        dict holds only ``"ver"``. Otherwise it holds ``"id"``, ``"ver"``,
        ``"summary"`` and ``"tags"`` (a sorted list without duplicates).
    """
    too_short = text is None or len(text) < 10
    if too_short:
        return {"ver": AiSummaryVersion}

    # Only long texts go through the summarization model; shorter ones are
    # used verbatim as their own summary.
    if len(text) > 3000:
        summary = get_summarization(text)
    else:
        summary = text

    # The English translation is used for tagging only; the returned summary
    # stays in its original language.
    english_text = get_en_translation(summary)
    unique_tags = set(filter_tags(get_tags(english_text, id)))

    return {
        "id": id,
        "ver": AiSummaryVersion,
        "summary": summary,
        "tags": sorted(unique_tags),
    }


def get_summarization(text: str):
    """Summarize *text* with the module-level summarization pipeline.

    This is a best-effort helper: it never raises for ordinary errors.

    Args:
        text: The text to summarize.

    Returns:
        The summary string, or ``None`` when *text* is empty/``None`` or the
        pipeline fails.
    """
    if not text:
        # Nothing to summarize; do not invoke the model on empty input.
        return None
    try:
        # max_length / min_length bound the generated summary. The
        # transformers generation settings count these in tokens, not words.
        result = summarization_pipeline(text, max_length=500, min_length=100, do_sample=False)
        return result[0]['summary_text'] if isinstance(result, list) else result['summary_text']
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate, and record the failure instead of
        # discarding it silently.
        logging.getLogger(__name__).exception("Summarization failed")
        return None


def get_en_translation(text: str):
    """Return *text* in English, translating it when necessary.

    This is a best-effort helper: it never raises for ordinary errors.

    Args:
        text: The text to translate, or ``None``.

    Returns:
        ``None`` when *text* is ``None`` or the translation pipeline fails;
        *text* unchanged when ``is_english`` reports it as English;
        otherwise the translated string.
    """
    if text is None:
        return None
    # is_english() reports detection problems as False, so it can sit outside
    # the try block; only the pipeline call needs guarding.
    if is_english(text):
        return text
    try:
        result = en_translation_pipe(text)
        return result[0]['translation_text'] if isinstance(result, list) else result['translation_text']
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate, and record the failure instead of
        # discarding it silently.
        logging.getLogger(__name__).exception("Translation to English failed")
        return None


def is_english(text):
    """Return ``True`` when langdetect classifies *text* as English.

    Args:
        text: The text to inspect.

    Returns:
        ``True`` if the detected language code is ``'en'``. ``False`` for
        empty/``None`` input, for any other language, or when detection
        fails.
    """
    if not text:
        # Empty input cannot be classified; answer directly instead of
        # relying on detect() raising.
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # detect() raises for input it cannot classify; treat that as
        # "not English". Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return False


def get_tags(text: str, id: str):
    """Collect category labels for *text* from both tag classifiers.

    This is a best-effort helper: it never raises for ordinary errors.

    Args:
        text: The (English) text to classify, or ``None``.
        id: Identifier of the document; used only to make a failure
            traceable in the log.

    Returns:
        The labels scoring at least ``MinTagScore``, those of the first
        classifier followed by those of the second. Duplicates are not
        removed here. Returns an empty list when *text* is ``None`` or
        either classifier fails.
    """
    if text is None:
        return []
    try:
        tags = []
        for classifier in (tag_gen_pipe_1, tag_gen_pipe_2):
            tags.extend(
                prediction['label']
                for prediction in classifier(text)
                if prediction['score'] >= MinTagScore
            )
        return tags
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate, and record the failure instead of
        # discarding it silently.
        # NOTE(review): inputs longer than a model's maximum sequence length
        # presumably raise and end up here; confirm whether truncation should
        # be enabled on the classifier calls.
        logging.getLogger(__name__).exception("Tag generation failed for id=%s", id)
        return []