Roaoch committed on
Commit
feeb971
1 Parent(s): 7e97035

From Deprecated

.github/workflows/main.yml ADDED
@@ -0,0 +1,18 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push --force https://Roaoch:$HF_TOKEN@huggingface.co/spaces/Roaoch/CyberClassic main
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+
+ WORKDIR /app
+
+ COPY ./startings.csv ./startings.csv
+ COPY ./src ./src
+ COPY ./requirements.txt ./requirements.txt
+ COPY ./main.py ./main.py
+
+ RUN pip install --upgrade pip
+ RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+ RUN pip install pandas numpy transformers fastapi uvicorn[standard]
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,22 @@
+ import warnings
+
+ from src.cyberclaasic import CyberClassic
+ from fastapi import FastAPI
+
+ warnings.simplefilter("ignore", UserWarning)
+
+ app = FastAPI()
+
+ text_generator = CyberClassic(
+     min_length=30,
+     max_length=50,
+     startings_path='./startings.csv'
+ )
+
+ @app.get("/")
+ def generate():
+     return {"text": str(text_generator.generate())}
+
+ @app.get('/answer')
+ def answer(prompt: str):
+     return {"text": str(text_generator.answer(f'{prompt}:\n'))}
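For review, the two endpoints can be exercised once the container is up; a minimal sketch, assuming the Dockerfile's default uvicorn binding on localhost:7860 (the prompt string is purely illustrative):

import requests

BASE = 'http://localhost:7860'

# GET / returns a random generated passage picked by the discriminator.
print(requests.get(f'{BASE}/').json()['text'])

# GET /answer returns a continuation conditioned on the query parameter.
print(requests.get(f'{BASE}/answer', params={'prompt': 'Что есть красота'}).json()['text'])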
requirements.txt ADDED
File without changes
src/cyberclaasic.py ADDED
@@ -0,0 +1,71 @@
+ import torch
+
+ import numpy as np
+ import pandas as pd
+
+ from src.discriminator import DiscriminatorModel
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2LMHeadModel, GenerationConfig
+
+
+ class CyberClassic(torch.nn.Module):
+     def __init__(
+         self,
+         min_length: int,
+         max_length: int,
+         startings_path: str
+     ) -> None:
+         super().__init__()
+         self.min_length = min_length
+         self.max_length = max_length
+         self.startings = pd.read_csv(startings_path)
+
+         self.tokenizer = AutoTokenizer.from_pretrained('Roaoch/CyberClassic-Generator')
+         self.generator: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained('Roaoch/CyberClassic-Generator')
+         self.discriminator = DiscriminatorModel.from_pretrained('Roaoch/CyberClassic-Discriminator')
+
+         self.tokenizer.pad_token = self.tokenizer.eos_token
+         self.generation_config = GenerationConfig(
+             max_new_tokens=max_length,
+             num_beams=6,
+             early_stopping=True,
+             do_sample=True,
+             # top_k=60,
+             # penalty_alpha=0.6,
+             # top_p=0.95,
+             eos_token_id=self.tokenizer.eos_token_id,
+             pad_token_id=self.tokenizer.pad_token_id
+         )
+
+     def encode(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+         # Pool the generator's last hidden state over non-padding tokens,
+         # weighting each position by its 1-based index.
+         last_hidden_state = self.generator(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)['hidden_states'][-1]
+         weights_for_non_padding = attention_mask * torch.arange(start=1, end=last_hidden_state.shape[1] + 1).unsqueeze(0)
+         sum_embeddings = torch.sum(last_hidden_state * weights_for_non_padding.unsqueeze(-1), dim=1)
+         num_of_non_padding_tokens = torch.sum(weights_for_non_padding, dim=-1).unsqueeze(-1)
+         return sum_embeddings / num_of_non_padding_tokens
+
+     def generate(self) -> str:
+         # Sample four random opening phrases, generate a continuation for each,
+         # then return the candidate whose discriminator score is closest to 0.889.
+         starts = self.startings['text'].values[np.random.randint(0, len(self.startings), 4)].tolist()
+         tokens = self.tokenizer(starts, return_tensors='pt', padding=True, truncation=True)
+         generated = self.generator.generate(**tokens, generation_config=self.generation_config)
+
+         input_emb = self.encode(input_ids=generated, attention_mask=torch.full(generated.size(), 1))
+         score = self.discriminator(input_emb)
+         score = torch.abs(score - 0.889)
+         index = int(torch.argmin(score))
+
+         decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
+
+         return decoded[index]
+
+     def answer(self, prompt: str) -> str:
+         prompt_tokens = self.tokenizer(prompt, return_tensors='pt')
+         output = self.generator.generate(
+             **prompt_tokens,
+             generation_config=self.generation_config,
+         )
+
+         decoded = self.tokenizer.batch_decode(output)
+         return decoded[0]
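The index-weighted pooling in encode is easier to see on a toy tensor. A minimal sketch of the same arithmetic, with made-up values and no model download:

import torch

# Toy batch: 1 sequence, 4 positions (last one padding), hidden size 2.
hidden = torch.tensor([[[1., 1.], [2., 2.], [3., 3.], [9., 9.]]])
mask = torch.tensor([[1, 1, 1, 0]])

# Position weights 1..4, zeroed where the mask is 0: [1, 2, 3, 0].
weights = mask * torch.arange(1, hidden.shape[1] + 1).unsqueeze(0)
pooled = (hidden * weights.unsqueeze(-1)).sum(dim=1) / weights.sum(dim=-1, keepdim=True)

print(pooled)  # tensor([[2.3333, 2.3333]]), i.e. (1*1 + 2*2 + 3*3) / (1+2+3)

Later tokens dominate the average, and the padding position (the 9s) contributes nothing.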
src/discriminator.py ADDED
@@ -0,0 +1,28 @@
+ import torch
+
+ from transformers import PretrainedConfig, PreTrainedModel
+
+
+ class DiscriminatorModelConfig(PretrainedConfig):
+     model_type = 'descriminatormodel'
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+
+ class DiscriminatorModel(PreTrainedModel):
+     config_class = DiscriminatorModelConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         # MLP head over 768-dim sentence embeddings; outputs a score in (0, 1).
+         self.model = torch.nn.Sequential(
+             torch.nn.Linear(768, 512),
+             torch.nn.ReLU(),
+             torch.nn.Dropout(0.1),
+             torch.nn.Linear(512, 256),
+             torch.nn.ReLU(),
+             torch.nn.Dropout(0.1),
+             torch.nn.Linear(256, 1),
+             torch.nn.Dropout(0.1),
+             torch.nn.Sigmoid()
+         )
+
+     def forward(self, input):
+         return self.model(input)
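A quick smoke test for the head; this sketch uses a freshly initialized instance with random weights rather than the published Roaoch/CyberClassic-Discriminator checkpoint:

import torch
from src.discriminator import DiscriminatorModel, DiscriminatorModelConfig

model = DiscriminatorModel(DiscriminatorModelConfig())
model.eval()  # disable the dropout layers for scoring

# One fake 768-dim embedding, shaped like the output of CyberClassic.encode.
emb = torch.randn(1, 768)
with torch.no_grad():
    print(model(emb))  # a value in (0, 1); the exact number varies with the init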
src/utils/proccess_data.py ADDED
@@ -0,0 +1,14 @@
+ import pandas as pd
+
+ # Build startings.csv: the first three words of every text in dataset.csv,
+ # later sampled by CyberClassic.generate as opening phrases.
+ df = pd.read_csv('dataset.csv')['text'].values
+ res = [
+     ' '.join(txt.split(' ')[:3])
+     for txt in df
+ ]
+
+ res_df = pd.DataFrame({'text': res})
+
+ res_df.to_csv('startings.csv', index=False)
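On toy input the transformation looks like this (the rows are hypothetical; dataset.csv itself is not part of the diff):

texts = [
    'Человек есть тайна. Её надо разгадать',
    'Красота спасёт мир',
]
print([' '.join(t.split(' ')[:3]) for t in texts])
# ['Человек есть тайна.', 'Красота спасёт мир']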
startings.csv ADDED
The diff for this file is too large to render. See raw diff