File size: 9,215 Bytes
5f01a56
0a215d5
b4c0a34
230678a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72216f4
659f8c9
c4aa0f7
72216f4
 
c4aa0f7
72216f4
 
 
c4aa0f7
 
 
 
b4c0a34
6a4a9f2
c4aa0f7
e786d48
92010d3
6a4a9f2
b4c0a34
6a4a9f2
 
b4c0a34
6a4a9f2
b4c0a34
 
e7531a2
6a4a9f2
 
 
e7531a2
 
e786d48
97ba385
 
b4c0a34
6a4a9f2
 
07c6884
b4c0a34
6a4a9f2
92458f0
6a4a9f2
b4c0a34
6a4a9f2
 
 
 
 
 
 
 
 
 
 
c3b06d1
6a4a9f2
c3b06d1
6a4a9f2
 
 
335b610
6a4a9f2
 
 
c3b06d1
 
6a4a9f2
1b304ad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Lookup table from a class index (0-137) to a human-readable category name.
# classify_text() uses it to translate the numeric suffix of each predicted
# label into a display name, and load_model() uses its length as the number
# of output labels.
# NOTE(review): the names look like arXiv subject areas, and the index order
# presumably mirrors the label encoding used when the checkpoint was trained
# -- confirm before reordering or editing entries.
id_to_cat = {0: 'High Energy Physics - Theory',
             1: 'Category Theory',
             2: 'Methodology',
             3: 'Formal Languages and Automata Theory',
             4: 'Robotics',
             5: 'Fluid Dynamics',
             6: 'Spectral Theory',
             7: 'Econometrics',
             8: 'Programming Languages',
             9: 'Discrete Mathematics',
             10: 'Networking and Internet Architecture',
             11: 'Quantum Gases',
             12: 'Data Structures and Algorithms',
             13: 'Databases',
             14: 'Earth and Planetary Astrophysics',
             15: 'Optimization and Control',
             16: 'Biomolecules',
             17: 'Cryptography and Security',
             18: 'Geometric Topology',
             19: 'Other Condensed Matter',
             20: 'Statistical Mechanics',
             21: 'Analysis of PDEs',
             22: 'Quantitative Methods',
             23: 'Artificial Intelligence',
             24: 'Classical Analysis and ODEs',
             25: 'Machine Learning',
             26: 'Combinatorics',
             27: 'Pattern Formation and Solitons',
             28: 'Solar and Stellar Astrophysics',
             29: 'Audio and Speech Processing',
             30: 'Computer Science and Game Theory',
             31: 'Mesoscale and Nanoscale Physics',
             32: 'Instrumentation and Methods for Astrophysics',
             33: 'Logic',
             34: 'General Relativity and Quantum Cosmology',
             35: 'Differential Geometry',
             36: 'Graphics',
             37: 'Logic in Computer Science',
             38: 'Materials Science',
             39: 'Computational Finance',
             40: 'General Literature',
             41: 'Tissues and Organs',
             42: 'Digital Libraries',
             43: 'Sound',
             44: 'Computational Engineering, Finance, and Science',
             45: 'Biological Physics',
             46: 'Algebraic Geometry',
             47: 'Genomics',
             48: 'Algebraic Topology',
             49: 'Mathematical Software',
             50: 'Cosmology and Nongalactic Astrophysics',
             51: 'Probability',
             52: 'Data Analysis, Statistics and Probability',
             53: 'Classical Physics',
             54: 'Image and Video Processing',
             55: 'Neural and Evolutionary Computing',
             56: 'History and Philosophy of Physics',
             57: 'Astrophysics of Galaxies',
             58: 'Molecular Networks',
             59: 'Cellular Automata and Lattice Gases',
             60: 'Optics',
             61: 'General Finance',
             62: 'Mathematical Physics',
             63: 'Multimedia',
             64: 'Computational Physics',
             65: 'Performance',
             66: 'History and Overview',
             67: 'Instrumentation and Detectors',
             68: 'Computer Vision and Pattern Recognition',
             69: 'Medical Physics',
             70: 'Quantum Physics',
             71: 'Number Theory',
             72: 'Social and Information Networks',
             73: 'Populations and Evolution',
             74: 'High Energy Physics - Lattice',
             75: 'Pricing of Securities',
             76: 'Nuclear Theory',
             77: 'Human-Computer Interaction',
             78: 'Representation Theory',
             79: 'Geophysics',
             80: 'Operator Algebras',
             81: 'Computational Complexity',
             82: 'Distributed, Parallel, and Cluster Computing',
             83: 'Software Engineering',
             84: 'Computational Geometry',
             85: 'Cell Behavior',
             86: 'Quantum Algebra',
             87: 'Hardware Architecture',
             88: 'Strongly Correlated Electrons',
             89: 'Portfolio Management',
             90: 'General Topology',
             91: 'Statistical Finance',
             92: 'Computation and Language',
             93: 'Atmospheric and Oceanic Physics',
             94: 'Multiagent Systems',
             95: 'Rings and Algebras',
             96: 'Nuclear Experiment',
             97: 'Space Physics',
             98: 'Risk Management',
             99: 'General Mathematics',
             100: 'Other Statistics',
             101: 'Symbolic Computation',
             102: 'High Energy Physics - Phenomenology',
             103: 'Popular Physics',
             104: 'Functional Analysis',
             105: 'Economics',
             106: 'Computation',
             107: 'Operating Systems',
             108: 'Complex Variables',
             109: 'Applications',
             110: 'Information Theory',
             111: 'Physics and Society',
             112: 'Other Computer Science',
             113: 'Metric Geometry',
             114: 'Signal Processing',
             115: 'Information Retrieval',
             116: 'Numerical Analysis',
             117: 'Chemical Physics',
             118: 'Trading and Market Microstructure',
             119: 'Soft Condensed Matter',
             120: 'Computers and Society',
             121: 'General Physics',
             122: 'Superconductivity',
             123: 'Statistics Theory',
             124: 'Emerging Technologies',
             125: 'High Energy Astrophysical Phenomena',
             126: 'Other Quantitative Biology',
             127: 'High Energy Physics - Experiment',
             128: 'Commutative Algebra',
             129: 'Applied Physics',
             130: 'Dynamical Systems',
             131: 'Adaptation and Self-Organizing Systems',
             132: 'Neurons and Cognition',
             133: 'Subcellular Processes',
             134: 'Chaotic Dynamics',
             135: 'Group Theory',
             136: 'Systems and Control',
             137: 'Disordered Systems and Neural Networks'
            }

@st.cache_resource
def load_model():
    """Load the sequence-classification model together with its tokenizer.

    The tokenizer is fetched for ``'distilbert-base-cased'``; the model is
    loaded from ``'checkpoint'`` with one output per entry of ``id_to_cat``
    and the multi-label problem type. The function is wrapped in
    ``st.cache_resource``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # Keep the original load order: tokenizer first, then the model.
    text_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

    model_options = {
        "num_labels": len(id_to_cat),
        "problem_type": "multi_label_classification",
    }
    classifier_model = AutoModelForSequenceClassification.from_pretrained('checkpoint', **model_options)

    return classifier_model, text_tokenizer

# Load the model once at script start; if loading raises OSError, show the
# error in the page and halt this script run instead of continuing without a
# model.
try:
    loaded = load_model()
except OSError as load_error:
    st.error(f"Ошибка при загрузке модели: {load_error}")
    st.stop()
else:
    model, tokenizer = loaded

def classify_text(title, description):
    """Score an article against every category and return normalized shares.

    The stripped title and description are joined with a single space and run
    through a ``text-classification`` pipeline built from the module-level
    ``model`` and ``tokenizer``, requesting all ``len(id_to_cat)`` labels.

    Args:
        title: Article title (string).
        description: Short description / abstract of the article (string).

    Returns:
        A list of ``(category_name, share)`` tuples ordered from the highest
        to the lowest share, where the shares are the raw pipeline scores
        rescaled so that they sum to 1. An empty list is returned when the
        pipeline raises (the error is reported through ``st.error``) or when
        there is nothing to normalize.
    """
    text = f"{title.strip()} {description.strip()}"
    try:
        classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=len(id_to_cat))
        # Truncate to the tokenizer's maximum length: without it, an input
        # longer than the model supports makes the forward pass fail.
        results = classifier(text, truncation=True)
    except Exception as e:
        # UI boundary: surface any pipeline failure to the user, don't crash.
        st.error(f"Ошибка при классификации текста: {e}")
        return []

    # For a single string the pipeline may return either [[{...}, ...]] or
    # [{...}, ...] depending on the transformers version / how top_k was
    # passed, so accept both nestings.
    entries = results[0] if results and isinstance(results[0], list) else results

    # NOTE(review): assumes labels are named 'LABEL_<index>' -- confirm the
    # checkpoint config does not define custom id2label names.
    res = [
        (id_to_cat[int(entry['label'].split('_')[1])], entry['score'])
        for entry in entries
    ]
    # The caller accumulates shares until a threshold is reached, which only
    # makes sense in descending order; enforce it here instead of relying on
    # the pipeline's ordering.
    res.sort(key=lambda pair: pair[1], reverse=True)

    total = sum(score for _, score in res)
    if total <= 0:
        # Nothing to normalize (no entries, or every score is zero).
        return []
    return [(label, score / total) for label, score in res]

# --- Page header and input widgets -----------------------------------------
st.title("🔬 Классификация англоязычных научных статей")
st.markdown("Введите заголовок и краткое описание научной статьи, чтобы определить её тематические категории.")

title = st.text_input("📝 Заголовок статьи", placeholder="Например: Deep Learning for Image Recognition")
description = st.text_area("🧾 Краткое описание статьи", height=150, placeholder="Кратко опишите содержание статьи...")
top_percent = st.text_input("📊 Порог суммарной вероятности (например, 95 или 0.95 для top 95%)", value="95")

# --- Classification on button press ----------------------------------------
if st.button("🚀 Классифицировать"):
    # Whitespace-only fields count as empty: classify_text() strips both
    # inputs, so they would otherwise be classified as blank text.
    if not title.strip() and not description.strip():
        st.warning("Пожалуйста, введите заголовок или описание статьи.")
    else:
        # Parse the cumulative-probability threshold. Values above 1 are
        # read as percentages (95 -> 0.95); anything that is not a number in
        # (0, 1] after that falls back to the 95% default.
        try:
            t = float(top_percent)
            if t > 1:
                t = t / 100
            if not (0 < t <= 1):
                raise ValueError()
        except ValueError:
            st.warning("Некорректное значение для порога вероятности. Используем значение по умолчанию: 95%.")
            t = 0.95

        with st.spinner("🔍 Классификация..."):
            results = classify_text(title, description)

            if results:
                cumulative_prob = 0.0
                # Format with :g rather than int(): int() truncates, so a
                # threshold of 0.29 (0.29 * 100 == 28.999999999999996) was
                # shown as 28%, and fractional thresholds such as 99.5 were
                # cut down to 99.
                st.subheader(f"📚 Топ категорий (до {t * 100:g}% совокупной вероятности):")
                # Print categories until their combined share reaches the
                # threshold; the category that crosses it is still shown.
                for label, score in results:
                    st.write(f"- **{label}**: {score*100:.2f}%")
                    cumulative_prob += score
                    if cumulative_prob >= t:
                        break
            else:
                st.info("Не удалось получить результаты классификации.")
elif title or description:
    # Text has been entered but the button was not pressed on this run.
    st.warning("Нажмите кнопку 'Классифицировать', чтобы получить результат.")