Spaces:
Sleeping
Sleeping
File size: 4,214 Bytes
6060e42 3ad2786 6060e42 db6eed1 6060e42 5ac5229 6060e42 5ac5229 6060e42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import numpy as np
import transformers
import streamlit as st
from streamlit import session_state
import json
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch import cuda
import numpy
import emoji
import string
import bs4
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # PorterStemmer LancasterStemmer
from nltk.stem import WordNetLemmatizer
import re
# --- Module-level NLP setup (runs once at import time) ---
stemmer = PorterStemmer()  # NOTE(review): not used anywhere in this visible chunk
# uncomment this when run first time
# (nltk.download is idempotent — it skips corpora that are already present)
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()  # NOTE(review): not used anywhere in this visible chunk
from transformers import pipeline
# NOTE(review): this rebinds `stopwords`, shadowing the
# `from nltk.corpus import stopwords` module import above — the cleaning
# functions below see this English word *list*, not the module.
stopwords = nltk.corpus.stopwords.words('english')
# Hugging Face text-classification pipeline; the model id suggests a
# 16-class XLM-RoBERTa fine-tune — label set not visible from here.
classifier = pipeline("text-classification", model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch")
def pre_processing_str_esg(df_col):
    """Heavy cleaning pipeline for long texts.

    Lower-cases the input, removes punctuation, URLs, English stopwords
    (module-level ``stopwords`` list), digits, special symbols and emoji,
    and collapses runs of spaces.

    Args:
        df_col (str): raw input text.

    Returns:
        str: the cleaned text.
    """
    df_col = df_col.lower()

    def remove_punctuation(text):
        # Drop every ASCII punctuation character.
        return "".join(ch for ch in text if ch not in string.punctuation)

    df_col = remove_punctuation(df_col)
    df_col = re.sub(r"http\S+", " ", df_col)

    def remove_stopwords(text):
        # `stopwords` is the module-level English stopword list (nltk).
        return " ".join(word for word in str(text).split() if word not in stopwords)

    df_col = remove_stopwords(df_col)
    # Second punctuation pass (replaces with spaces) — mostly redundant after
    # remove_punctuation above, kept for identical handling of edge cases.
    df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ', df_col)
    df_col = df_col.replace("¶", "")
    df_col = df_col.replace("§", "")
    df_col = df_col.replace('“', ' ')
    df_col = df_col.replace('”', ' ')
    df_col = df_col.replace('-', ' ')
    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    df_col = REPLACE_BY_SPACE_RE.sub(' ', df_col)
    df_col = BAD_SYMBOLS_RE.sub(' ', df_col)
    df_col = re.sub('[0-9]+', ' ', df_col)
    # BUG FIX: original was re.sub(' ', ' ', df_col) — a no-op; the intent
    # was clearly to collapse repeated spaces into one.
    df_col = re.sub(' +', ' ', df_col)

    def remove_emoji(text):
        # Renamed parameter: the original called it `string`, shadowing the
        # stdlib `string` module.
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE,
        )
        return emoji_pattern.sub(r'', text)

    df_col = remove_emoji(df_col)
    return df_col
def pre_processing_str(df_col):
    """Route text to the appropriate cleaning pipeline.

    Long texts (>= 70 whitespace-separated tokens) go through the heavy
    ``pre_processing_str_esg`` pipeline; shorter ones get a light clean:
    '#'/'!' removed, URLs, digits, emoji, @mentions and non-printable
    ASCII stripped, repeated spaces collapsed.

    Args:
        df_col (str): raw input text.

    Returns:
        str: the cleaned text.
    """
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)

    df_col = df_col.replace('#', '')
    df_col = df_col.replace('!', '')
    df_col = re.sub(r"http\S+", " ", df_col)
    df_col = re.sub('[0-9]+', ' ', df_col)
    # BUG FIX: original was re.sub(' ', ' ', df_col) — a no-op; collapse
    # repeated spaces into a single space as intended.
    df_col = re.sub(' +', ' ', df_col)

    def remove_emojis(text):
        # Uses the third-party `emoji` package to strip emoji characters.
        return emoji.replace_emoji(text)

    df_col = remove_emojis(df_col)
    # Strip @mentions and any leftover URLs.
    df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)
    # Keep printable ASCII only.
    df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)
    return df_col.strip()
# NOTE: leftover from an earlier Flask deployment — there, the module name had to
# match the WSGI app object (application.py defining `application = Flask(__name__)`).
# Not relevant to this Streamlit app.
def process(text):
    """Clean ``text`` and classify it with the module-level pipeline.

    Args:
        text (str): raw input text.

    Returns:
        dict: ``{'output_16': ...}`` where the value is the classifier's
        top-2 predictions, the string 'No Text' when the cleaned text is
        empty, or an error message when classification fails.
    """
    text = pre_processing_str(text)
    try:
        if text:  # empty string after cleaning -> nothing to classify
            results = classifier(text, top_k=2)
        else:
            results = 'No Text'
        return {'output_16': results}
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # no longer swallowed; still best-effort so the UI never crashes.
        return {'output_16': 'something went wrong'}
# --- Streamlit UI ------------------------------------------------------
st.set_page_config(page_title="core_risk", page_icon="📈")

# Persist the last classification result across Streamlit reruns.
if 'topic_class' not in session_state:
    session_state['topic_class'] = ""

st.title("Core Risk Category Classifier")
text = st.text_area(label="Please write the text below",  # fixed typo: "bellow"
                    placeholder="What does the text say?")

def classify(text):
    """Button callback: classify ``text`` and display the result."""
    session_state['topic_class'] = process(text)
    st.text_area("result", value=session_state['topic_class'])

st.button("Classify", on_click=classify, args=[text])
|