|
import torch |
|
from transformers import AutoModel, AutoConfig |
|
import joblib |
|
import pickle |
|
import pandas as pd |
|
import numpy as np |
|
|
|
|
|
# Load the keyword dictionary and normalise it to one keyword per row.
excel_path = "./Dict.xlsx"
keywords = pd.read_excel(excel_path)

# Lower-case the comma-separated keyword lists, explode them so each
# keyword gets its own row, then trim whitespace from both columns.
keywords["Words"] = keywords["Words"].str.lower().str.split(",")
keywords = keywords.explode("Words")
keywords["Words"] = keywords["Words"].str.strip()
keywords["Fields"] = keywords["Fields"].str.strip()
|
# Paths to the pre-trained artefacts produced by the training pipeline.
model_path = "./classifier.joblib"
vectorizer_path = "./vectorizer"

# Restore the fitted classifier and its TF-IDF vectorizer.
# NOTE(review): joblib/pickle deserialisation executes arbitrary code —
# only load artefacts from a trusted source.
classifier = joblib.load(model_path)
with open(vectorizer_path, "rb") as f:
    tfidf_vectorizer = pickle.load(f)
|
import gradio as gr |
|
def classify_text(text):
    """Classify a free-text service description.

    Returns a 4-tuple:
        (keyword-based field match or "None",
         top-1, top-2, top-3 classifier predictions).

    Relies on the module-level ``keywords`` table, ``tfidf_vectorizer``
    and ``classifier`` loaded at import time.
    """
    import re

    import nltk
    from snowballstemmer import stemmer

    description = str(text).lower().strip()

    # keyword -> list of fields, built from the module-level keyword table.
    keyword_mapping = {}
    for keyword, field in zip(keywords["Words"], keywords["Fields"]):
        keyword_mapping.setdefault(keyword, []).append(field)

    # Whole-word keyword search. sorted() makes the tie-break deterministic
    # (the original picked an arbitrary element of a set).
    matching_fields = set()
    for keyword, fields in keyword_mapping.items():
        if re.search(r"\b{}\b".format(re.escape(keyword)), description):
            matching_fields.update(fields)
    assigned_field = sorted(matching_fields)[0] if matching_fields else "None"

    # --- text normalisation for the TF-IDF classifier ---
    nltk.download("stopwords", quiet=True)
    stopword_es = set(nltk.corpus.stopwords.words("spanish"))
    stemmer_es = stemmer("spanish")

    # Strip the same punctuation/digit characters the original pipeline
    # targeted, via an explicit compiled regex.  The original used pandas
    # str.replace, whose default `regex=` behaviour changed across pandas
    # versions (literal match in pandas >= 2.0, so the patterns never fired).
    noise = re.compile(r'[&%βββ’:β>.β¬Βͺ,;()/"ββ\[\]\d+-]')
    tokens = []
    for word in description.split():
        word = noise.sub("", word)
        # Drop empty leftovers and Spanish stopwords, stem the rest.
        if word and word not in stopword_es:
            tokens.append(stemmer_es.stemWord(word))
    processed = " ".join(tokens)

    # Vectorise and take the classifier's top-3 classes by probability.
    # (The original's "top_four" names actually selected three.)
    features = tfidf_vectorizer.transform([processed])
    probs = classifier.predict_proba(features)[0]
    top_indices = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:3]
    top_classes = [classifier.classes_[i] for i in top_indices]

    return assigned_field, top_classes[0], top_classes[1], top_classes[2]
|
|
|
|
|
# Wire the classifier into a simple Gradio UI: one description in,
# four text outputs (keyword match followed by the top-3 predictions).
description_box = gr.Textbox(
    label="Description",
    placeholder="Enter your Service Description",
    lines=3,
)
output_boxes = [gr.Textbox(), gr.Textbox(), gr.Textbox(), gr.Textbox()]

iface = gr.Interface(
    fn=classify_text,
    inputs=description_box,
    outputs=output_boxes,
    title="Text Classification App",
    theme=gr.themes.Default(primary_hue="blue"),
    examples=[["ilaap"], ["finrep"], ["liquidez"], ["Data governance"]],
)

iface.launch()
|
|