Ambareesh T N committed on
Commit
c7baaec
1 Parent(s): c2e5ccb

Add application file

Browse files
Files changed (2) hide show
  1. app.py.py +161 -0
  2. requirements.txt +7 -0
app.py.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Untitled3.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/18DTgeDomshKNQMgYQ6y6mJbBom9mRw5l
8
+ """
9
+
10
+ # Commented out IPython magic to ensure Python compatibility.
11
+ # %%writefile app.py
12
+ # %%writefile 'app.py'
13
+ import nltk
14
+ import math
15
+ import torch
16
+ # from transformers import AutoModelForSequenceClassification, AutoTokenizer
17
+ # from transformers import AutoTokenizer, AutoModelForSequenceClassification
18
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
19
+ from nltk.tokenize import word_tokenize, sent_tokenize
20
+ from nltk.corpus import stopwords
21
+ from collections import Counter
22
+ from flair.data import Sentence
23
+ from flair.models import SequenceTagger
24
# Fetch the NLTK resources requested by the original script: the stop-word
# list and the 'punkt' tokenizer data.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)
import streamlit as st

# Configure the Streamlit page with the "wide" layout.
st.set_page_config(layout="wide")
29
+
30
+
31
+
32
def divide_sentence(sentence):
    """Split ``sentence`` into sub-sentences at separator words.

    The text is tokenized with ``nltk.word_tokenize``.  Every token whose
    lower-cased form is in the separator set closes the sub-sentence being
    built and is itself discarded; the remaining tokens of each
    sub-sentence are re-joined with single spaces.

    Args:
        sentence: Raw input text.

    Returns:
        list[str]: The non-empty sub-sentences in their original order.
        The list is empty when the input contains no tokens other than
        separators.
    """
    # NOTE(review): 'the' and 'i' are not conjunctions; presumably they are
    # listed so that unpunctuated input still splits at clause starts --
    # confirm before removing them.
    separators = {
        "and", "but", "or", "however", "therefore", "furthermore",
        "nevertheless", "the", "i",
    }
    subsentences = []
    current = []
    for token in nltk.word_tokenize(sentence):
        if token.lower() in separators:
            # A separator ends the current chunk; consecutive separators
            # must not emit empty chunks.
            if current:
                subsentences.append(" ".join(current))
                current = []
        else:
            current.append(token)
    # Flush the trailing chunk only when it has content.  Appending it
    # unconditionally produced a spurious "" entry for empty input or for
    # text ending in a separator word.
    if current:
        subsentences.append(" ".join(current))
    return subsentences
51
+
52
+
53
+
54
def topic_identify(subsentences):
    """Predict topic labels for every sub-sentence.

    Loads the tokenizer and the sequence-classification model of the
    checkpoint below (configured for multi-label classification), scores
    each text, and keeps every label whose sigmoid-activated score is
    above 0.5.

    Args:
        subsentences: Iterable of text fragments to classify.

    Returns:
        list[str]: One entry per fragment holding the selected labels
        joined by ','; the entry is '' when no score passes the threshold.
    """
    checkpoint = "cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all"

    def sigmoid(x):
        return 1 / (1 + math.exp(-x))

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, problem_type="multi_label_classification"
    )
    model.eval()
    id2label = model.config.id2label

    topics = []
    for text in subsentences:
        # Inference only: no gradients are needed.
        with torch.no_grad():
            encoded = tokenizer(text, return_tensors='pt')
            result = model(**encoded)
        # First output element, first (only) row: one raw score per label.
        scores = result[0][0].detach().tolist()
        selected = [
            id2label[index]
            for index, score in enumerate(scores)
            if sigmoid(score) > 0.5
        ]
        topics.append(','.join(selected))
    return topics
70
+
71
+
72
def sentiment_score(subsentences):
    """Run sentiment analysis on every sub-sentence.

    Builds a ``sentiment-analysis`` pipeline from the checkpoint below and
    takes the first prediction returned for each text, e.g.
    ``{'label': 'positive', 'score': 0.9484752416610718}``.

    Args:
        subsentences: Iterable of text fragments to score.

    Returns:
        list[str]: One ``'<label> , <score>'`` string per fragment.
    """
    from transformers import pipeline

    checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    sentiment_task = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

    # The pipeline returns a list of predictions per call; only the first
    # one is reported.
    first_predictions = (sentiment_task(text)[0] for text in subsentences)
    return [
        ' , '.join((prediction['label'], str(prediction['score'])))
        for prediction in first_predictions
    ]
84
+
85
+
86
+
87
def intent_identify(subsentences):
    """Classify the intent of every sub-sentence.

    Wraps the tokenizer and model of the
    ``cartesinus/fedcsis-intent_baseline-xlm_r-en`` checkpoint in a
    ``TextClassificationPipeline`` and takes the first prediction returned
    for each text.

    Args:
        subsentences: Iterable of text fragments to classify.

    Returns:
        list[str]: One ``'<label> , <score>'`` string per fragment.
    """
    model_name = 'cartesinus/fedcsis-intent_baseline-xlm_r-en'
    classifier = TextClassificationPipeline(
        model=AutoModelForSequenceClassification.from_pretrained(model_name),
        tokenizer=AutoTokenizer.from_pretrained(model_name),
    )

    # Only the first prediction of each call is reported.
    first_predictions = (classifier(text)[0] for text in subsentences)
    return [
        ' , '.join((prediction['label'], str(prediction['score'])))
        for prediction in first_predictions
    ]
98
+
99
+
100
+
101
def entity_identify(subsentences):
    """Extract named-entity spans from every sub-sentence.

    Loads the Flair ``SequenceTagger`` registered under the name ``'ner'``,
    runs it on each text and collects the text of every span tagged
    ``'ner'``.

    Args:
        subsentences: Iterable of text fragments to analyse.

    Returns:
        list[str]: One entry per fragment holding the entity texts joined
        by ','; the entry is '' when the tagger reports no spans.
    """
    tagger = SequenceTagger.load('ner')

    entities = []
    for text in subsentences:
        flair_sentence = Sentence(text)
        # predict() annotates the Sentence object in place.
        tagger.predict(flair_sentence)
        entities.append(
            ','.join(span.text for span in flair_sentence.get_spans('ner'))
        )
    return entities
116
+
117
+
118
+
119
def keyword_identify(subsentences):
    """Pick the most frequent content words of every sub-sentence.

    Each text is split into sentences and words with NLTK; words are
    lower-cased and kept only when they are purely alphabetic and not in
    the English stop-word list.  The two most frequent remaining words are
    reported (fewer if the text has fewer distinct candidates).

    Args:
        subsentences: Iterable of text fragments to analyse.

    Returns:
        list[str]: One entry per fragment holding up to two keywords, most
        frequent first, joined by ','.
    """
    stop_words = set(stopwords.words('english'))

    def extract_keywords(text):
        candidates = []
        for sentence_text in sent_tokenize(text):
            for word in word_tokenize(sentence_text):
                lowered = word.lower()
                if lowered not in stop_words and word.isalpha():
                    candidates.append(lowered)
        # most_common orders equal counts by first occurrence, matching a
        # stable descending sort on the frequency.
        return [word for word, _count in Counter(candidates).most_common(2)]

    return [','.join(extract_keywords(text)) for text in subsentences]
140
# Page header.
st.markdown("<h1 style='text-align: center; color: white; background : grey'>Process Fest</h1>", unsafe_allow_html=True)
import pandas as pd
import numpy as np

# Input widgets: free text plus a button that triggers the analysis.
sent = st.text_input(label = 'Enter the Text:')
button = st.button('submit')
# Sample input:
# sent = "The stay at AAA was good The food was not that bad but the service was very bad and I prefer BBB than AAA I’ll raise a complaint against AAA"
if button:
    if not sent.strip():
        # Nothing to analyse: skip loading and running every model on an
        # empty string.
        st.warning('Please enter some text to analyse.')
    else:
        subsentences = divide_sentence(sent)
        topic = topic_identify(subsentences)
        sentiment = sentiment_score(subsentences)
        intent = intent_identify(subsentences)
        entity = entity_identify(subsentences)
        keyword = keyword_identify(subsentences)
        # One row per sub-sentence.  The topic column was previously
        # computed but never shown, so it is included here.
        df = pd.DataFrame(
            {
                'subsentences': subsentences,
                'topic': topic,
                'sentiment and score': sentiment,
                'intent': intent,
                'entity': entity,
                'keyword': keyword,
            })
        st.dataframe(data=df, width=None, height=None, use_container_width=False)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ nltk == 3.7
2
+ torch == 1.13.1
3
+ transformers == 4.25.1
4
+ flair == 0.12.1
5
+ streamlit
6
+ pandas
7
+ numpy