AritORR committed on
Commit
c53610b
·
1 Parent(s): 4ada67c
Files changed (1) hide show
  1. app.py +59 -23
app.py CHANGED
@@ -1,30 +1,66 @@
1
- import gradio as gr
2
- import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
 
 
 
 
 
4
 
5
- model_name = "ai-sage/GigaChat-20B-A3B-instruct-v1.5-int8"
6
 
7
- bnb_config = BitsAndBytesConfig(
8
- load_in_4bit=True,
9
- bnb_4bit_quant_type="nf4",
10
- bnb_4bit_compute_dtype=torch.bfloat16,
11
- bnb_4bit_use_double_quant=True,
12
- llm_int8_enable_fp32_cpu_offload=True
13
- )
14
 
15
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
16
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, quantization_config=bnb_config, device_map="auto")
17
- model.generation_config = GenerationConfig.from_pretrained(model_name)
 
18
 
19
- messages = [
20
- {"role": "user", "content": "Докажи теорему о неподвижной точке"}
21
- ]
22
- input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
23
- outputs = model.generate(input_tensor.to(model.device))
24
 
25
- result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=False)
 
26
 
27
- with gr.Blocks() as demo:
28
- gr.Textbox(result, label="Вывод строки", interactive=False)
29
 
30
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+ import evaluate
3
+ import pandas as pd
4
+ import numpy as np
5
+ from datasets import Dataset
6
+ from sklearn.model_selection import train_test_split
7
+ from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
8
+ TrainingArguments, Trainer)
9
 
10
# Base checkpoint that is fine-tuned for sequence classification further down.
model_name = "DeepPavlov/rubert-base-cased"

# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/mteb/RuSciBenchOECDClassification/" + splits["train"])

# Hold out 20% of the train parquet for evaluation. The seed is fixed so the
# split (and therefore the reported metric) is reproducible between runs.
# NOTE(review): splits["test"] is declared above but never read - confirm that
# evaluating on a slice of the train file is what is intended.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Convert the dataframes to Dataset objects. preserve_index=False stops the
# shuffled pandas index from being carried along as an extra column.
train = Dataset.from_pandas(train, preserve_index=False)
test = Dataset.from_pandas(test, preserve_index=False)

# Text preprocessing uses the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 
 
23
 
24
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding='max_length'`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded
26
 
27
# Tokenize both splits. batched=True passes the tokenizer lists of texts
# instead of one example per call; the resulting columns are the same, but
# the mapping needs far fewer tokenizer invocations.
tokenized_train = train.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)

# Load the pretrained model with a sequence-classification head.
# NOTE(review): num_labels is hard-coded - confirm that 28 matches the number
# of distinct label ids in the dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=28,
)

# Training parameters.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='test_trainer_log',
    evaluation_strategy='epoch',
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    report_to='none',
)
43
+
44
# Define how the evaluation metric is computed.
metric = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as handed over by the Trainer.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The model is configured with 28 classes. The F1 metric defaults to
    # average='binary', which is rejected for multiclass targets, so an
    # explicit multiclass averaging mode is required.
    return metric.compute(
        predictions=predictions,
        references=labels,
        average='macro',
    )
50
+
51
# Run the training.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test,
    train_dataset=tokenized_train,
)
trainer.train()

# Save the model.
save_directory = './pt_save_pretrained'
# Only the model is written; the tokenizer save is left disabled:
# tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
# Alternatively, save through the trainer:
# trainer.save_model('CustomModels/CustomHamSpam')