# import nltk
# import math
# import torch
# # from transformers import AutoModelForSequenceClassification, AutoTokenizer
# # from transformers import AutoTokenizer, AutoModelForSequenceClassification
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.corpus import stopwords
# from collections import Counter
# from flair.data import Sentence
# from flair.models import SequenceTagger
# nltk.download('stopwords')
# nltk.download('punkt')
# import streamlit as st

# st.set_page_config(layout="wide")
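# # Helper functions for the analysis pipeline: divide_sentence() splits the input
# # text on a small list of conjunction/filler tokens, and each resulting sub-sentence
# # is passed to topic_identify(), sentiment_score(), intent_identify(),
# # entity_identify() and keyword_identify(), each of which returns one result
# # string per sub-sentence.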
# def divide_sentence(sentence):
#     conjunctions = ["and", "but", "or", "however", "therefore", "furthermore", "nevertheless", 'the', 'i']
#     tokens = nltk.word_tokenize(sentence)
#     subsentences = []
#     current_subsentence = []
#     for token in tokens:
#         if token.lower() in conjunctions:
#             if len(current_subsentence) > 0:
#                 subsentences.append(" ".join(current_subsentence))
#                 current_subsentence = []
#         else:
#             current_subsentence.append(token)
#     # Add the final subsentence to the list
#     subsentences.append(" ".join(current_subsentence))
#     # print(subsentences)
#     # d = {}
#     # for s in subsentences:
#     #     d[s] = {'accuracy':None,}
#     return subsentences

# def topic_identify(subsentences):
#     def sigmoid(x):
#         return 1 / (1 + math.exp(-x))
#     tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all")
#     model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all", problem_type="multi_label_classification")
#     model.eval()
#     class_mapping = model.config.id2label
#     topics = []
#     for text in subsentences:
#         with torch.no_grad():
#             tokens = tokenizer(text, return_tensors='pt')
#             output = model(**tokens)
#         flags = [sigmoid(s) > 0.5 for s in output[0][0].detach().tolist()]
#         topic = [class_mapping[n] for n, i in enumerate(flags) if i]
#         topics.append(','.join(topic))
#     return topics

# def sentiment_score(subsentences):
#     tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
#     model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
#     from transformers import pipeline
#     sentiment_task = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
#     senti = []
#     for sen in subsentences:
#         a = sentiment_task(sen)
#         # [{'label': 'positive', 'score': 0.9484752416610718}]
#         a = a[0]
#         senti.append(a['label'] + ' , ' + str(a['score']))
#     return senti

# def intent_identify(subsentences):
#     model_name = 'cartesinus/fedcsis-intent_baseline-xlm_r-en'
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForSequenceClassification.from_pretrained(model_name)
#     classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
#     intents = []
#     for s in subsentences:
#         res = classifier(s)
#         a = res[0]
#         intents.append(a['label'] + ' , ' + str(a['score']))
#     return intents

# def entity_identify(subsentences):
#     # load the NER tagger
#     tagger = SequenceTagger.load('ner')
#     # create a sentence to analyze
#     entities = []
#     for sentence in subsentences:
#         sentence = Sentence(sentence)
#         # run NER on the sentence
#         tagger.predict(sentence)
#         # print the entities found in the sentence
#         ent = []
#         for entity in sentence.get_spans('ner'):
#             ent.append(entity.text)
#         entities.append(','.join(ent))
#     return entities

# def keyword_identify(subsentences):
#     class KeywordExtractor:
#         def __init__(self):
#             self.stop_words = set(stopwords.words('english'))
#         def extract_keywords(self, text):
#             # tokenize sentences
#             sentences = sent_tokenize(text)
#             # tokenize words and remove stop words
#             words = [word.lower() for sentence in sentences for word in word_tokenize(sentence) if word.lower() not in self.stop_words and word.isalpha()]
#             # count word frequencies
#             word_freq = Counter(words)
#             # sort words by frequency
#             sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
#             # return the top 2 keywords
#             return [word[0] for word in sorted_words[:2]]
#     key = KeywordExtractor()
#     keywords = []
#     for s in subsentences:
#         keyword = key.extract_keywords(s)
#         keywords.append(','.join(keyword))
#     return keywords

# st.markdown("