import torch
from transformers import (AutoModelForSequenceClassification, AutoModelForSeq2SeqLM,
                          AutoConfig, AutoModelForTokenClassification,
                          AutoTokenizer, pipeline)
from peft import PeftModel, PeftConfig


def load_sentiment_analyzer():
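    """Load the tokenizer and sequence-classification model used for emotion detection."""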
    tokenizer = AutoTokenizer.from_pretrained("aliciiavs/sentiment-analysis-whatsapp2")
    model = AutoModelForSequenceClassification.from_pretrained("aliciiavs/sentiment-analysis-whatsapp2")

    return tokenizer, model

def load_summarizer():
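    """Load facebook/bart-large, attach the samsum PEFT adapter, and merge it into the base model for dialogue summarization."""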
    config = PeftConfig.from_pretrained("marcelomoreno26/bart-large-samsum-adapter")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
    tokenizer.pad_token = tokenizer.eos_token
    model = PeftModel.from_pretrained(model, "marcelomoreno26/bart-large-samsum-adapter", config=config)
    model = model.merge_and_unload()

    return tokenizer, model

def load_NER():
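    """Build a token-classification pipeline for named entity recognition with averaged sub-word aggregation."""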
    config = AutoConfig.from_pretrained("hannahisrael03/wikineural-multilingual-ner-finetuned-wikiann")
    model = AutoModelForTokenClassification.from_pretrained("hannahisrael03/wikineural-multilingual-ner-finetuned-wikiann", config=config)
    tokenizer = AutoTokenizer.from_pretrained("hannahisrael03/wikineural-multilingual-ner-finetuned-wikiann")
    pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="average")

    return pipe

def get_sentiment_analysis(text, tokenizer, model):
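    """Classify `text` and return its predicted emotion label as a string."""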
    # Truncate to the model's maximum length so very long chats do not overflow the context window
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Get predicted probabilities and predicted label
    probabilities = torch.softmax(outputs.logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1)
    # Convert the predicted label tensor to a Python integer
    predicted_label = predicted_label.item()
    # Map predicted label index to sentiment label
    label_dic = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    # Print the predicted sentiment label
    return label_dic[predicted_label]


def generate_summary(text, tokenizer, model):
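    """Summarize `text`, chunking inputs longer than 512 tokens into overlapping segments and joining the partial summaries."""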
    prefix = "summarize: "
    encoded_input = tokenizer(prefix + text, return_tensors='pt', add_special_tokens=True)
    input_ids = encoded_input['input_ids']

    # Check if input_ids exceed the model's max length
    max_length = 512
    if input_ids.shape[1] > max_length:
        # Split the input_ids into manageable segments
        total_summary = []
        for i in range(0, input_ids.shape[1], max_length - 50):  # step by max_length - 50 so consecutive segments overlap by 50 tokens of context
            segment_ids = input_ids[:, i:i + max_length]
            output_ids = model.generate(segment_ids, max_length=150, num_beams=5, early_stopping=True)
            segment_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            total_summary.append(segment_summary)

        # Concatenate all segment summaries
        summary = ' '.join(total_summary)
    else:
        # Process as usual
        output_ids = model.generate(input_ids, max_length=150, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return summary


def get_NER(text, pipe):
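    """Run NER over `text` and return [word, entity_group] rows, deduplicated, with ORG names longer than two words dropped."""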
    # Use pipeline to predict NER
    results = pipe(text)
    # Filter duplicates while retaining the highest score for each entity type and word combination
    unique_entities = {}
    for ent in results:
        key = (ent['entity_group'], ent['word'])
        if key not in unique_entities or unique_entities[key]['score'] < ent['score']:
            unique_entities[key] = ent

    # Prepare the output, sorted by the start position to maintain the order they appear in the text
    filtered_results = sorted(unique_entities.values(), key=lambda x: x['start'])
    # Format the results as [word, entity_group] rows for a table display
    formatted_results = [[ent['word'], ent['entity_group']] for ent in filtered_results]
    # Keep ORG entities only if the name is at most two words; keep all other entity types as-is
    table_rows = []
    for word, entity_group in formatted_results:
        if entity_group == 'ORG':
            if len(word.split()) <= 2:
                table_rows.append([word, entity_group])
        else:
            table_rows.append([word, entity_group])

    return table_rows
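

if __name__ == "__main__":
    # Minimal usage sketch (assumes the Hugging Face checkpoints referenced above
    # can be downloaded in the current environment); the sample chat is illustrative only.
    sample = "Anna: I got the job offer today! Ben: That's amazing, congratulations! Anna: Thanks, I start at Google next month."

    sent_tokenizer, sent_model = load_sentiment_analyzer()
    print("Sentiment:", get_sentiment_analysis(sample, sent_tokenizer, sent_model))

    sum_tokenizer, sum_model = load_summarizer()
    print("Summary:", generate_summary(sample, sum_tokenizer, sum_model))

    ner_pipe = load_NER()
    print("Entities:", get_NER(sample, ner_pipe))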