import pickle
import re

import gradio as gr
import joblib
import nltk
import pandas as pd
from snowballstemmer import stemmer
# Build the keyword dictionary: lowercase, split comma-separated keyword
# lists into one row per keyword, and strip stray whitespace.
excel_path = "./Dict.xlsx"
keywords = pd.read_excel(excel_path)
keywords = keywords.assign(Words=keywords.Words.str.lower())
keywords = keywords.assign(Words=keywords.Words.str.split(",")).explode("Words")
keywords = keywords.assign(Words=keywords.Words.str.strip())
keywords = keywords.assign(Fields=keywords.Fields.str.strip())
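# Dict.xlsx is assumed to hold a "Words" column (comma-separated keywords)
# and a "Fields" column (the label each keyword signals). Illustrative rows
# only; the real dictionary contents are not shown here:
#   Words              Fields
#   "finrep, corep"    "Regulatory Reporting"
#   "liquidez, lcr"    "Liquidity"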
# Load the classifier (joblib file) and TF-IDF vectorizer (pickle file)
# bundled with this Space.
model_path = "./classifier.joblib"
vectorizer_path = "./vectorizer"
classifier = joblib.load(model_path)
with open(vectorizer_path, "rb") as f:
    tfidf_vectorizer = pickle.load(f)
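# Both artifacts are assumed to be scikit-learn objects (the code below calls
# predict_proba and classes_), so a quick sanity check could be:
# print(classifier.classes_)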

# One-time NLP setup (done at module load so it does not rerun per request).
nltk.download("stopwords")
stopword_es = nltk.corpus.stopwords.words("spanish")
stemmer_es = stemmer("spanish")


def classify_text(text):
    # Keyword pass: look for dictionary terms in the raw (lowercased) text.
    Sample = pd.DataFrame({"Description": [text]})
    Sample = Sample.assign(Description=Sample.Description.str.lower())
    Sample = Sample.reset_index(names="n").reset_index(names="Doc").drop("n", axis=1)
    Sample = Sample.assign(Description=Sample.Description.astype("str"))
    Sample = Sample.groupby(["Doc"]).Description.apply(" ".join).reset_index()
    Sample["Description"] = Sample["Description"].str.strip()
    # Map each keyword to the list of fields it signals.
    keyword_mapping = {}
    for keyword, field in zip(keywords["Words"], keywords["Fields"]):
        keyword_mapping.setdefault(keyword, []).append(field)
    # Find matching keywords in a description and return one corresponding
    # field (an arbitrary one if several keywords match), or "None".
    def find_field(description):
        matching_fields = set()
        for keyword, fields in keyword_mapping.items():
            if re.search(r"\b{}\b".format(re.escape(keyword)), description):
                matching_fields.update(fields)
        return list(matching_fields)[0] if matching_fields else "None"

    # Apply the function to the 'Description' column of the sample
    Sample["AssignedField"] = Sample["Description"].apply(find_field)
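    # Illustrative only (actual mappings live in Dict.xlsx): if "finrep" were
    # listed under a field "Regulatory Reporting", then
    # find_field("informe finrep trimestral") would return that field.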
    # find_field returns the string "None" (never NaN) when nothing matches,
    # so keep the keyword result directly instead of filtering on isna().
    assigned_field = Sample["AssignedField"].iloc[0]
    # TF-IDF pass: rebuild the sample and preprocess it for the classifier.
    Sample = pd.DataFrame({"Description": [text]})
    Sample = Sample.assign(Description=Sample.Description.str.lower())
    Sample = Sample.reset_index(names="n").reset_index(names="Doc").drop("n", axis=1)
    # Split into tokens and reshape the data to one row per token
    Sample = Sample.assign(Description=Sample.Description.str.split()).explode("Description")
    # Strip punctuation, quotes, brackets, and digits from each token.
    Sample = Sample.assign(Description=Sample.Description.str.replace(r"[&%—●‒:–>.€ª,;()\-/]", "", regex=True))
    Sample = Sample.assign(Description=Sample.Description.str.replace('"', "", regex=False))
    Sample = Sample.assign(Description=Sample.Description.str.replace(r"[“”\[\]\d+]", "", regex=True))
    Sample = Sample.reset_index()
    # Filter stopwords
    Sample = Sample.loc[~Sample.Description.isin(stopword_es)]
    # Stem each token with the Spanish Snowball stemmer, then rejoin the
    # tokens into one document per Doc id.
    Sample["Description"] = Sample["Description"].apply(stemmer_es.stemWord)
    Sample = Sample.assign(Description=Sample.Description.astype("str"))
    Sample = Sample.groupby(["Doc"]).Description.apply(" ".join).reset_index()
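    # Illustrative: after stopword removal and stemming, a phrase like
    # "los informes de liquidez" should reduce to something along the lines
    # of "inform liquidez" (exact stems depend on the Snowball rules).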
    # Vectorize and make probability predictions for the (single) document.
    new_text_tfidf = tfidf_vectorizer.transform(Sample["Description"])
    predicted_probabilities = classifier.predict_proba(new_text_tfidf)
    # Get the top three predicted class labels for each document.
    top_three_predictions = []
    for probs in predicted_probabilities:
        # Sort the probabilities and take the indices of the top three
        top_three_indices = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:3]
        # Map those indices back to class labels
        top_three_classes = [classifier.classes_[i] for i in top_three_indices]
        top_three_predictions.append(top_three_classes)
    preds = pd.DataFrame(top_three_predictions).rename(
        columns={0: "Pred_1", 1: "Pred_2", 2: "Pred_3"}
    )
    results_1 = (
        pd.concat([Sample.reset_index(), preds], axis=1)
        .drop("index", axis=1)[["Doc", "Description", "Pred_1", "Pred_2", "Pred_3"]]
    )
    # Return the keyword-assigned field plus the classifier's top three labels.
    return (
        assigned_field,
        results_1["Pred_1"][0],
        results_1["Pred_2"][0],
        results_1["Pred_3"][0],
    )
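
# Quick local check without launching the UI (illustrative input):
# print(classify_text("informe de liquidez trimestral"))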
# Create a Gradio interface
iface = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(label="Description", placeholder="Enter your Service Description", lines=3),
    outputs=[gr.Textbox(label=name) for name in ("Keyword Match", "Pred_1", "Pred_2", "Pred_3")],
    title="Text Classification App",
    theme=gr.themes.Default(primary_hue="blue"),  # Default theme with a blue accent
    examples=[["ilaap"], ["finrep"], ["liquidez"], ["Data governance"]],
)
# Launch the Gradio interface
iface.launch()
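# When running outside a hosted Space, the standard launch() option
# share=True would also create a temporary public link:
# iface.launch(share=True)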