"""Streamlit app that classifies free text into core risk categories.

The script cleans the user's text (with a heavier pipeline for long,
document-like input), feeds it to a Hugging Face text-classification
pipeline and shows the two best-scoring labels.
"""

import json
import logging
import re
import string

import bs4
import emoji
import nltk
import numpy
import numpy as np
import pandas as pd
import streamlit as st
import transformers
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # PorterStemmer LancasterStemmer
from nltk.stem import WordNetLemmatizer
from streamlit import session_state
from torch import cuda
from transformers import BertTokenizer, BertModel
from transformers import pipeline

logger = logging.getLogger(__name__)

# Must be the first Streamlit command of the page; the cached loaders below
# may render a spinner, which already counts as a Streamlit command.
st.set_page_config(page_title="core_risk", page_icon="\U0001F4C8")

# Streamlit re-executes this script on every interaction.  cache_resource keeps
# expensive one-off setup alive across reruns; on Streamlit versions without it
# the decorator degrades to a no-op and the setup simply runs each time.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _download_nltk_data():
    """Fetch the NLTK corpora used by this script."""
    for package in ('wordnet', 'omw-1.4', 'stopwords'):
        nltk.download(package)
    return True


@_cache_resource
def _load_classifier():
    """Build the text-classification pipeline."""
    return pipeline(
        "text-classification",
        model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch",
    )


_download_nltk_data()

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# NOTE: this rebinds the imported corpus reader name to the English word list.
stopwords = nltk.corpus.stopwords.words('english')
_STOPWORD_SET = frozenset(stopwords)  # O(1) membership tests per word

classifier = _load_classifier()

# Patterns are compiled once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_URL_RE = re.compile(r"http\S+")
_DIGITS_RE = re.compile(r"[0-9]+")
_BAD_SYMBOLS_RE = re.compile(r"[^0-9a-z #+_]")
_MENTION_RE = re.compile(r"(?:\@|https?\://)\S+")
_NON_PRINTABLE_ASCII_RE = re.compile(r"[^\x20-\x7E]+")
_WHITESPACE_RE = re.compile(r"\s+")


def pre_processing_str_esg(df_col):
    """Aggressively normalise a long text before classification.

    Steps, in order: lowercase, delete ASCII punctuation, blank out tokens
    starting with ``http``, drop English stopwords, delete pilcrow/section
    signs, replace every character outside ``[0-9a-z #+_]`` with a space,
    replace digit runs with a space, collapse whitespace and strip.

    Because ASCII punctuation is deleted up front and every other
    non-alphanumeric character is blanked by the bad-symbol pattern, separate
    passes for punctuation, hyphens, brackets, curly quotes or emoji would
    never match anything and are therefore not performed.

    Args:
        df_col: The raw text as a ``str``.

    Returns:
        The cleaned text, containing only lowercase ASCII letters and single
        spaces.
    """
    text = df_col.lower().translate(_PUNCTUATION_TABLE)
    text = _URL_RE.sub(" ", text)
    text = " ".join(word for word in text.split() if word not in _STOPWORD_SET)
    # Pilcrow and section signs are deleted outright (no separating space);
    # escapes keep the literals safe from source-encoding damage.
    text = text.replace("\u00b6", "").replace("\u00a7", "")
    text = _BAD_SYMBOLS_RE.sub(" ", text)
    text = _DIGITS_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()


def pre_processing_str(df_col):
    """Clean a text for classification, choosing the pipeline by length.

    Texts with 70 or more whitespace-separated words are delegated to
    ``pre_processing_str_esg``.  Shorter texts keep their casing and get a
    lighter clean-up: ``#`` and ``!`` are deleted, URLs and digit runs are
    blanked, emoji and ``@mentions`` are removed, whitespace (including
    newlines and tabs) is normalised to single spaces and any remaining
    character outside printable ASCII is deleted.

    Args:
        df_col: The raw text as a ``str``.

    Returns:
        The cleaned, stripped text (possibly empty).
    """
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)

    text = df_col.replace('#', '').replace('!', '')
    text = _URL_RE.sub(" ", text)
    text = _DIGITS_RE.sub(" ", text)
    text = emoji.replace_emoji(text)
    text = _MENTION_RE.sub("", text)
    # Turn newlines/tabs into spaces *before* the non-printable filter,
    # otherwise that filter deletes them and glues adjacent words together.
    text = _WHITESPACE_RE.sub(" ", text)
    text = _NON_PRINTABLE_ASCII_RE.sub("", text)
    # The removals above can leave runs of spaces behind; collapse them last.
    return _WHITESPACE_RE.sub(" ", text).strip()


def process(text):
    """Pre-process ``text`` and classify it.

    Args:
        text: The raw user input; ``None`` or an empty string is treated as
            missing text.

    Returns:
        A dict with the single key ``'output_16'`` whose value is the
        classifier result for the two best labels, ``'No Text'`` when nothing
        is left after cleaning, or ``'something went wrong'`` when the
        classifier raises.
    """
    cleaned = pre_processing_str(text) if text else ""
    if not cleaned:
        return {'output_16': 'No Text'}
    try:
        results = classifier(cleaned, top_k=2)
    except Exception:
        # Keep the UI alive, but leave a trace of the failure in the logs.
        logger.exception("Classification failed")
        return {'output_16': 'something went wrong'}
    return {'output_16': results}


if 'topic_class' not in session_state:
    session_state['topic_class'] = ""

st.title("Core Risk Category Classifier")
text = st.text_area(label="Please write the text bellow", placeholder="What does the text say?")


def classify(text):
    """Button callback: classify ``text`` and store the result in the session."""
    session_state['topic_class'] = process(text)


st.text_area("result", value=session_state['topic_class'])
st.button("Classify", on_click=classify, args=[text])