Commit 8d358a1
planetearth79 committed
Parent(s): b982018
Upload 14 files
Files changed:
- app.py +107 -0
- multi_class_model/config.json +48 -0
- multi_class_model/pytorch_model.bin +3 -0
- multi_class_model/special_tokens_map.json +1 -0
- multi_class_model/tokenizer.json +0 -0
- multi_class_model/tokenizer_config.json +1 -0
- multi_class_model/trainer_state.json +46 -0
- multi_class_model/vocab.txt +0 -0
- multi_label_model/config.json +48 -0
- multi_label_model/pytorch_model.bin +3 -0
- multi_label_model/special_tokens_map.json +1 -0
- multi_label_model/tokenizer.json +0 -0
- multi_label_model/tokenizer_config.json +1 -0
- multi_label_model/vocab.txt +0 -0
app.py
ADDED
@@ -0,0 +1,107 @@
import streamlit as st
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import pandas as pd
import numpy as np


st.markdown("# Arxiv Papers Classifier")
st.markdown("<img width=200px src='https://blog.arxiv.org/files/2021/02/arxiv-logo.svg'>", unsafe_allow_html=True)

st.markdown("After processing and filtering the dataset, each paper is left with one or more of 9 classes:")
st.markdown("""
1) ai

2) cs

3) cv

4) lg

5) math

6) ml

7) phys

8) q-bio

9) stat

""")

# Mapping from model output index to arXiv category label.
id2label = {
    0: "ai",
    1: "cs",
    2: "cv",
    3: "lg",
    4: "math",
    5: "ml",
    6: "phys",
    7: "q-bio",
    8: "stat"
}

title_text = st.text_input("ENTER TITLE HERE")
summary_text = st.text_area("ENTER SUMMARY HERE")
text = title_text + " " + summary_text


# 1. Multi-class model: exactly one category per paper (softmax over logits).
@st.cache_resource
def load_multi_class_model():
    loaded_tokenizer = AutoTokenizer.from_pretrained("multi_class_model")
    loaded_model = AutoModelForSequenceClassification.from_pretrained("multi_class_model")
    return loaded_tokenizer, loaded_model

tokenizer_1, model_1 = load_multi_class_model()

st.markdown("## multi-class classification")

text_input = tokenizer_1(text, padding="max_length", truncation=True, return_tensors='pt')
with torch.no_grad():
    text_res = model_1(**text_input)
text_probs = torch.softmax(text_res.logits, dim=1).cpu().numpy()[0]

# Keep the smallest set of classes whose cumulative probability reaches 95%
# (including the class that crosses the threshold, so the set is never empty).
order = np.argsort(text_probs)[::-1]
ordered_text_probs = text_probs[order]
top_k = np.searchsorted(np.cumsum(ordered_text_probs), 0.95) + 1
idxs = order[:top_k]
st.markdown("Top-95% classes: " + ", ".join([id2label[i] for i in idxs]))

chart_data = pd.DataFrame(
    text_probs,
    columns=['class probability'])
chart_data["index"] = np.array(list(id2label.values()))
chart_data = chart_data.set_index("index")
st.bar_chart(chart_data)


# 2. Multi-label model: an independent probability per category (sigmoid over logits).
@st.cache_resource
def load_multi_label_model():
    loaded_tokenizer = AutoTokenizer.from_pretrained("multi_label_model")
    loaded_model = AutoModelForSequenceClassification.from_pretrained("multi_label_model")
    return loaded_tokenizer, loaded_model

tokenizer_2, model_2 = load_multi_label_model()

st.markdown("## multi-label classification")

text_input = tokenizer_2(text, padding="max_length", truncation=True, return_tensors='pt')
with torch.no_grad():
    text_res = model_2(**text_input)

text_probs = torch.sigmoid(text_res.logits).cpu().numpy()[0]

probs = np.stack([text_probs, 1 - text_probs], axis=1)

chart_data = pd.DataFrame(
    probs,
    columns=['belong', "not belong"])
chart_data["index"] = np.array(list(id2label.values()))
chart_data = chart_data.set_index("index")
st.markdown("Probabilities for each class")
st.bar_chart(chart_data)
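The "Top-95%" readout above keeps the smallest prefix of classes (sorted by probability) whose cumulative probability reaches 0.95. A minimal standalone sketch of that cutoff, on a made-up probability vector rather than real model output:

import numpy as np

# Hypothetical softmax output over the 9 classes (sums to 1).
probs = np.array([0.02, 0.55, 0.01, 0.25, 0.02, 0.08, 0.03, 0.01, 0.03])
order = np.argsort(probs)[::-1]          # class indices, most probable first
cum = np.cumsum(probs[order])            # running probability mass
top_k = np.searchsorted(cum, 0.95) + 1   # smallest prefix reaching 95%
print(order[:top_k])                     # indices of the classes kept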
multi_class_model/config.json
ADDED
@@ -0,0 +1,48 @@
{
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.14.0",
  "vocab_size": 28996
}
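Note that this config keeps the generic LABEL_0..LABEL_8 names, which is why app.py carries its own id2label dict. A minimal sketch to confirm, assuming the multi_class_model directory from this commit is available locally:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("multi_class_model")
print(config.problem_type)  # "single_label_classification"
print(config.id2label[0])   # "LABEL_0", not "ai"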
multi_class_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:be39938c1ff4164d67f61b50aec81d799e07f297de6d627e0701dee0f2594aa5
size 263193713
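This is a Git LFS pointer, not the weights themselves; the actual ~263 MB file is fetched separately by LFS. A minimal sketch, assuming a local clone with the LFS object downloaded, to check the file against the sha256 recorded above:

import hashlib

expected = "be39938c1ff4164d67f61b50aec81d799e07f297de6d627e0701dee0f2594aa5"
h = hashlib.sha256()
with open("multi_class_model/pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected, "downloaded weights do not match the LFS pointer"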
multi_class_model/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
multi_class_model/tokenizer.json
ADDED
The diff for this file is too large to render.
multi_class_model/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "distilbert-base-cased", "tokenizer_class": "DistilBertTokenizer"}
multi_class_model/trainer_state.json
ADDED
@@ -0,0 +1,46 @@
{
  "best_metric": 1.0723919868469238,
  "best_model_checkpoint": "clf_trainer/checkpoint-6560",
  "epoch": 2.0,
  "global_step": 6560,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "learning_rate": 4.5e-05,
      "loss": 1.1784,
      "step": 3280
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.6109756097560975,
      "eval_loss": 1.100429654121399,
      "eval_runtime": 43.9741,
      "eval_samples_per_second": 186.474,
      "eval_steps_per_second": 18.647,
      "step": 3280
    },
    {
      "epoch": 2.0,
      "learning_rate": 4e-05,
      "loss": 1.0022,
      "step": 6560
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.6201219512195122,
      "eval_loss": 1.0723919868469238,
      "eval_runtime": 43.854,
      "eval_samples_per_second": 186.984,
      "eval_steps_per_second": 18.698,
      "step": 6560
    }
  ],
  "max_steps": 32800,
  "num_train_epochs": 10,
  "total_flos": 8690946151219200.0,
  "trial_name": null,
  "trial_params": null
}
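The trainer state shows training stopped after 2 of the planned 10 epochs, with eval accuracy improving from about 0.611 to 0.620. A minimal sketch for pulling the eval curve out of this file, assuming a local clone:

import json

with open("multi_class_model/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_accuracy" in entry:
        print(entry["epoch"], entry["eval_accuracy"], entry["eval_loss"])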
multi_class_model/vocab.txt
ADDED
The diff for this file is too large to render.
multi_label_model/config.json
ADDED
@@ -0,0 +1,48 @@
{
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "ai",
    "1": "cs",
    "2": "cv",
    "3": "lg",
    "4": "math",
    "5": "ml",
    "6": "phys",
    "7": "q-bio",
    "8": "stat"
  },
  "initializer_range": 0.02,
  "label2id": {
    "ai": 0,
    "cs": 1,
    "cv": 2,
    "lg": 3,
    "math": 4,
    "ml": 5,
    "phys": 6,
    "q-bio": 7,
    "stat": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "multi_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.14.0",
  "vocab_size": 28996
}
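Unlike the multi-class config, this one stores the real category names, so the mapping hardcoded in app.py could come straight from the model. A minimal sketch, assuming the multi_label_model directory from this commit is available locally:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("multi_label_model")
print(config.id2label)  # {0: 'ai', 1: 'cs', ..., 8: 'stat'}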
multi_label_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0252ffb53aac0c0bf85efe4253b68e1756ab2241ef152c28384256cd4a856828
size 263193713
multi_label_model/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
multi_label_model/tokenizer.json
ADDED
The diff for this file is too large to render.
multi_label_model/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "distilbert-base-cased", "tokenizer_class": "DistilBertTokenizer"}
multi_label_model/vocab.txt
ADDED
The diff for this file is too large to render.