Spaces:

MostoHF
/

article_sphere_classification

Sleeping

App Files Files Community

article_sphere_classification / app.py

MostoHF

Update app.py

2472ad0 verified 3 months ago

raw

history blame contribute delete

3.05 kB

	import streamlit as st
	import torch
	from torch import nn
	import csv
	from transformers import AutoModel, AutoTokenizer
	from huggingface_hub import hf_hub_download
	from model import ClassificationModel

	# Page metadata for the Streamlit app: browser-tab title and a centered layout.
	# NOTE(review): Streamlit has historically required set_page_config to be the
	# first st.* call in the script, so keep it above the cached loaders below;
	# confirm against the Streamlit version in use.
	st.set_page_config(page_title="Article Theme Classifier", layout="centered")

	# Run on the GPU when CUDA is available, otherwise on the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Passed as max_length to the tokenizer in inference(); inputs are padded
	# and truncated to this many tokens.
	MAX_LENGTH = 512

	@st.cache_resource
	def get_model():
	    """Build the fine-tuned classifier and return it ready for inference.

	    A pretrained ``distilbert-base-cased`` encoder is wrapped in
	    ``ClassificationModel``; the ``pytorch_model.bin`` weights are fetched
	    from the ``MostoHF/TunedDistillBertCased`` Hub repository and loaded
	    into it. The model is then moved to the module-level ``device`` and
	    put into eval mode.

	    Returns:
	        The loaded ``ClassificationModel`` instance.
	    """
	    encoder = AutoModel.from_pretrained("distilbert-base-cased")
	    classifier = ClassificationModel(encoder)

	    checkpoint_file = hf_hub_download(
	        repo_id="MostoHF/TunedDistillBertCased",
	        filename="pytorch_model.bin",
	    )

	    # map_location remaps the stored tensors onto `device` while loading.
	    classifier.load_state_dict(torch.load(checkpoint_file, map_location=device))
	    classifier.to(device)
	    classifier.eval()

	    return classifier

	@st.cache_resource
	def get_tokenizer():
	    """Return the ``distilbert-base-cased`` tokenizer loaded via ``AutoTokenizer``."""
	    checkpoint_name = "distilbert-base-cased"
	    return AutoTokenizer.from_pretrained(checkpoint_name)

	@st.cache_resource
	def get_ind_to_cat():
	    """Load the class-index -> category-name mapping from ``ind_to_category.csv``.

	    The file is opened by its bare name, i.e. relative to the current
	    working directory. The first row is treated as a header and skipped;
	    every following non-blank row must have exactly two columns: an
	    integer index and the category name.

	    Returns:
	        A dict mapping integer class index to category name. It is empty
	        when the file contains no data rows.

	    Raises:
	        FileNotFoundError: if ``ind_to_category.csv`` does not exist.
	        ValueError: if a non-blank row does not have exactly two columns,
	            or its first column is not an integer.
	    """
	    ind_to_category_copy = {}
	    # Explicit UTF-8 so the decoded names do not depend on the host locale.
	    with open('ind_to_category.csv', mode='r', newline='', encoding='utf-8') as f:
	        reader = csv.reader(f)
	        # Skip the header; the default avoids StopIteration on an empty file.
	        next(reader, None)
	        for row in reader:
	            # csv.reader yields [] for a blank line (e.g. an extra empty
	            # line at the end of the file); skip it instead of failing
	            # on the two-name unpack below.
	            if not row:
	                continue
	            key, value = row
	            ind_to_category_copy[int(key)] = value  # keys are ints
	    return ind_to_category_copy

	# Module-level objects read by inference(). Each loader is wrapped in
	# st.cache_resource.
	class_model = get_model()
	tokenizer = get_tokenizer()
	ind_to_category = get_ind_to_cat()

	def inference(title, abstract, threshold=0.95):
	    """Return the most probable themes for an article, up to a cumulative cutoff.

	    ``title`` and ``abstract`` are joined with ``'@'``, tokenized (padded
	    and truncated to ``MAX_LENGTH``) and passed through ``class_model``.
	    The model output is exponentiated before use.
	    NOTE(review): this implies the model returns log-probabilities;
	    confirm against ``ClassificationModel``.

	    Classes are taken in descending order of probability until their
	    running sum reaches ``threshold``. The top class is appended before
	    the cutoff is checked, so it is always included.

	    Args:
	        title: article title text.
	        abstract: article abstract text.
	        threshold: cumulative probability at which selection stops.

	    Returns:
	        A ``(themes, probs)`` pair of equal-length lists: category names
	        looked up in ``ind_to_category`` and the matching probabilities.
	    """
	    text = title + '@' + abstract

	    batch = tokenizer(
	        text,
	        padding="max_length",
	        truncation=True,
	        max_length=MAX_LENGTH,
	        return_tensors="pt",
	    )
	    ids = batch["input_ids"].to(device)
	    mask = batch["attention_mask"].to(device)

	    with torch.no_grad():
	        exp_output = torch.exp(class_model(ids, mask))

	    # Drop the batch dimension, then rank classes from most to least likely.
	    ranked_probs, ranked_indices = torch.sort(exp_output.squeeze(0), descending=True)

	    chosen_indices = []
	    chosen_probs = []
	    running_sum = 0.0

	    for prob_tensor, index_tensor in zip(ranked_probs, ranked_indices):
	        prob_value = prob_tensor.item()
	        running_sum += prob_value
	        chosen_indices.append(index_tensor.item())
	        chosen_probs.append(prob_value)
	        if running_sum >= threshold:
	            break

	    return [ind_to_category[i] for i in chosen_indices], chosen_probs


	# ------------------- Streamlit UI -------------------

	st.title("📄 Article Theme Classifier")

	# The hint text is passed as `placeholder`, not `value`: as a default value
	# it would be sent to the model as real input and would make the emptiness
	# check below always pass.
	title = st.text_input("Title", placeholder="Введите title...")
	abstract = st.text_input("Abstract", placeholder="Введите abstract...")
	threshold = st.slider("Выберите cumulative probability threshold", 0.0, 1.0, step=0.01, value=0.95)

	if st.button("Submit"):
	    # Whitespace-only fields count as empty.
	    if title.strip() or abstract.strip():
	        st.success("✅ Title")
	        st.info("📑 Abstract")
	        themes, probs = inference(title, abstract, threshold)
	        st.subheader("Predicted Themes:")
	        # themes and probs are parallel lists returned by inference().
	        for theme, prob in zip(themes, probs):
	            st.write(f"{theme} — {prob:.4f}")
	    else:
	        st.warning("❌ Please fill in at least one of the fields.")