import gradio as gr
import torch
import joblib
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
# Load IndoBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
# Load IndoBERT model
model = AutoModel.from_pretrained("indolem/indobert-base-uncased")
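# Note: AutoModel loads the base encoder without a task-specific head; the Random
# Forest classifiers loaded below act as the prediction heads on top of its embeddings.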
# Mapping dictionaries for labels
priority_score_mapping = {1: "LOW", 2: "MEDIUM", 3: "HIGH"}
problem_domain_mapping = {0: "OPERATIONAL", 1: "TECHNICAL"}
# Load the trained Random Forest models
best_classifier1 = joblib.load('best_classifier1_optimized.pkl')
best_classifier2 = joblib.load('best_classifier2_optimized.pkl')
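# Both .pkl files are loaded from the working directory and are assumed to be
# scikit-learn classifiers trained on 768-dimensional IndoBERT feature vectors
# (classifier1 predicts the priority score, classifier2 the problem domain).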
markdown_text = '''
## Label Description
### Priority Score
* **Low** label means that a temporary/corrective solution can solve the problem. A permanent solution will be provided later because the impact on the business can still be handled.
* **Medium** label means that a time constraint needs to be set for solving the problem. If the problem remains unresolved for too long, it will impact the business side.
* **High** label means that the problem is urgent and must be solved immediately.
### Problem Domain
* **Operational** label means that the scope of the problem is on the business or daily operational side.
* **Technical** label means that the scope of the problem is on the technical (technology) side, such as the mobile/web application.
'''
description = "Write feedback about a capsule hotel you have visited or stayed at. The machine learning model will predict the priority score and problem domain of the feedback."
# Function to perform predictions
def predict(text):
    # Convert the sentence into input features (token IDs and attention mask)
    encoded_inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=128)
    # Perform word embedding using the IndoBERT model
    with torch.no_grad():
        outputs = model(**encoded_inputs)
        embeddings = outputs.last_hidden_state
    # Convert the embeddings to a NumPy array and flatten the token dimension
    embeddings = embeddings.numpy()
    embeddings_custom_flat = embeddings.reshape(embeddings.shape[0], -1)
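    # Shape note: last_hidden_state is (batch_size, seq_len, 768), so the flattened
    # array has seq_len * 768 columns; the block below forces it to exactly 768 so it
    # matches what the Random Forest classifiers were trained on.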
    # Ensure the flattened embeddings have exactly 768 features
    num_features_expected = 768
    if embeddings_custom_flat.shape[1] < num_features_expected:
        # If the number of features is less than 768, zero-pad the embeddings
        pad_width = num_features_expected - embeddings_custom_flat.shape[1]
        embeddings_custom_flat = np.pad(embeddings_custom_flat, ((0, 0), (0, pad_width)), mode='constant')
    elif embeddings_custom_flat.shape[1] > num_features_expected:
        # If the number of features is more than 768, truncate the embeddings
        embeddings_custom_flat = embeddings_custom_flat[:, :num_features_expected]
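    # Note: truncating the flattened array to its first 768 columns keeps only the
    # first token's hidden state (the [CLS] embedding for BERT-style models), which
    # then serves as the sentence-level feature vector.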
    # Predict the priority_score for the custom input
    custom_priority_score = best_classifier1.predict(embeddings_custom_flat)
    # Predict the problem_domain for the custom input
    custom_problem_domain = best_classifier2.predict(embeddings_custom_flat)
    # Map numerical labels to human-readable labels
    mapped_priority_score = priority_score_mapping.get(custom_priority_score[0], "unknown")
    mapped_problem_domain = problem_domain_mapping.get(custom_problem_domain[0], "unknown")
    return f"Predicted Priority Score: {mapped_priority_score}, Predicted Problem Domain: {mapped_problem_domain}"
# Create a Gradio interface
gr.Interface(fn=predict, inputs="text", outputs="text", title="Simple Risk Classifier Demo (Case Study: Capsule Hotel)", description=description, article=markdown_text).launch(debug=True)
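# To run this demo locally (a sketch; package versions are not pinned here):
#   pip install gradio torch transformers joblib numpy
#   python app.py
# then open the local URL that Gradio prints in the console.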