import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    pipeline,
)
from peft import PeftModel, PeftConfig


def load_sentiment_analyzer():
    # Load the fine-tuned WhatsApp emotion classifier and its tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("aliciiavs/sentiment-analysis-whatsapp2")
    model = AutoModelForSequenceClassification.from_pretrained("aliciiavs/sentiment-analysis-whatsapp2")
    return tokenizer, model


def load_summarizer():
    # Load base BART, apply the SAMSum LoRA adapter, and merge the adapter
    # into the base weights so inference runs on a single plain model.
    config = PeftConfig.from_pretrained("marcelomoreno26/bart-large-samsum-adapter")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
    tokenizer.pad_token = tokenizer.eos_token
    model = PeftModel.from_pretrained(model, "marcelomoreno26/bart-large-samsum-adapter", config=config)
    model = model.merge_and_unload()
    return tokenizer, model


def load_NER():
    # Build a token-classification pipeline; "average" aggregation merges
    # sub-word pieces into whole-word entity spans.
    config = AutoConfig.from_pretrained("hannahisrael03/wikineural-multilingual-ner-finetuned-wikiann")
    model = AutoModelForTokenClassification.from_pretrained(
        "hannahisrael03/wikineural-multilingual-ner-finetuned-wikiann", config=config
    )
    tokenizer = AutoTokenizer.from_pretrained("hannahisrael03/wikineural-multilingual-ner-finetuned-wikiann")
    pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="average")
    return pipe


def get_sentiment_analysis(text, tokenizer, model):
    # Truncate to the model's maximum input length to avoid overflow errors
    # on long chats.
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Get predicted probabilities and the predicted label
    probabilities = torch.softmax(outputs.logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1)
    # Convert the predicted label tensor to a Python integer
    predicted_label = predicted_label.item()
    # Map the predicted label index to its emotion label
    label_dic = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    return label_dic[predicted_label]


def generate_summary(text, tokenizer, model):
    prefix = "summarize: "
    encoded_input = tokenizer.encode_plus(prefix + text, return_tensors='pt', add_special_tokens=True)
    input_ids = encoded_input['input_ids']

    # Check whether the input exceeds the per-call length budget
    max_length = 512
    if input_ids.shape[1] > max_length:
        # Split the input into segments and summarize each one. Stepping by
        # max_length - 50 makes consecutive segments overlap slightly, so
        # context is not lost at the segment boundaries.
        total_summary = []
        for i in range(0, input_ids.shape[1], max_length - 50):
            segment_ids = input_ids[:, i:i + max_length]
            output_ids = model.generate(segment_ids, max_length=150, num_beams=5, early_stopping=True)
            segment_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            total_summary.append(segment_summary)
        # Concatenate all segment summaries
        summary = ' '.join(total_summary)
    else:
        # Short input: summarize in a single pass
        output_ids = model.generate(input_ids, max_length=150, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary


def get_NER(text, pipe):
    # Use the pipeline to predict named entities
    results = pipe(text)
    # Drop duplicates, keeping the highest score for each
    # (entity type, word) combination
    unique_entities = {}
    for ent in results:
        key = (ent['entity_group'], ent['word'])
        if key not in unique_entities or unique_entities[key]['score'] < ent['score']:
            unique_entities[key] = ent
    # Sort by start position so entities keep the order they appear in the text
    filtered_results = sorted(unique_entities.values(), key=lambda x: x['start'])
    # Format the results for a table display
    formatted_results = [[ent['word'], ent['entity_group']] for ent in filtered_results]
    # Keep ORG entities only when they are at most two words long;
    # all other entity types pass through unfiltered.
    filtered_results = []
    for entity in formatted_results:
        if entity[1] == 'ORG':
            if len(entity[0].split()) <= 2:
                filtered_results.append(entity)
        else:
            filtered_results.append(entity)
    return filtered_results
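

# A minimal usage sketch of the three pipelines above. This block is an
# illustrative assumption, not part of the module's required API: it assumes
# network access to the Hugging Face Hub for the model downloads, and the
# sample transcript is made up for demonstration.
if __name__ == "__main__":
    sample = (
        "Anna: Are we still on for lunch tomorrow? "
        "Ben: Yes! Meet you at noon by the office."
    )

    # Emotion classification over the whole chat
    sent_tokenizer, sent_model = load_sentiment_analyzer()
    print("Sentiment:", get_sentiment_analysis(sample, sent_tokenizer, sent_model))

    # Abstractive summary via BART + merged SAMSum adapter
    sum_tokenizer, sum_model = load_summarizer()
    print("Summary:", generate_summary(sample, sum_tokenizer, sum_model))

    # Named entities as [word, entity_group] rows
    ner_pipe = load_NER()
    print("Entities:", get_NER(sample, ner_pipe))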