3ed0k4 committed on
Commit
65224b2
1 Parent(s): 2add513

Upload 12 files

Browse files
src/__pycache__/model.cpython-312.pyc ADDED
Binary file (2.99 kB). View file
 
src/__pycache__/train.cpython-312.pyc ADDED
Binary file (6.41 kB). View file
 
src/__pycache__/utils.cpython-312.pyc ADDED
Binary file (1.97 kB). View file
 
src/app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/app.py
2
+ from flask import Flask, request, render_template
3
+ import torch
4
+ from model import TransformerModel
5
+ from utils import load_vocab, tokenize
6
+ import time
7
+ import random
8
+ import os
9
+
10
# Flask application object; HTML templates are resolved from the "templates" folder.
app = Flask(__name__, template_folder='templates')

# Configuration
MODEL_PATH = 'models/3ed0k4_model_epoch10.pth'  # Update this path based on the latest model
VOCAB_PATH = 'vocab.json'
EMBED_SIZE = 256
NUM_HEADS = 8
HIDDEN_DIM = 512
NUM_LAYERS = 4
DROPOUT = 0.1
MAX_LENGTH = 100  # Maximum tokens to generate

# The vocabulary size determines the embedding / output dimensions of the model.
vocab = load_vocab(VOCAB_PATH)
vocab_size = len(vocab)

# Architecture hyper-parameters must match the ones used when the checkpoint was trained.
model = TransformerModel(
    vocab_size,
    EMBED_SIZE,
    NUM_HEADS,
    HIDDEN_DIM,
    NUM_LAYERS,
    dropout=DROPOUT,
)

# Fail at import time with a clear message when no checkpoint is available.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model file not found at {MODEL_PATH}. Please train the model first.")
model.load_state_dict(
    torch.load(MODEL_PATH, map_location=torch.device('cpu'))
)
model.eval()  # inference only: disables dropout
41
+
42
def generate_text(prompt, max_length=MAX_LENGTH):
    """Greedily extend ``prompt`` with up to ``max_length`` generated tokens.

    The prompt is tokenized with ``tokenize`` and mapped to ids through the
    module-level ``vocab`` (unknown tokens become ``<UNK>``).  At each step the
    most likely next token is appended; generation stops early when the model
    predicts ``<PAD>``.

    Args:
        prompt: Raw user text.
        max_length: Maximum number of tokens to generate.

    Returns:
        The prompt tokens followed by the generated tokens, joined by single
        spaces.  An empty string is returned when the prompt contains no
        tokens (e.g. only punctuation), because the model cannot be run on an
        empty sequence.
    """
    tokens = tokenize(prompt)
    if not tokens:
        # An empty sequence would make `outputs[0, -1, :]` raise IndexError.
        return ''

    unk_id = vocab['<UNK>']
    pad_id = vocab['<PAD>']
    generated = [vocab.get(token, unk_id) for token in tokens]

    # The model only has a fixed number of learned positions; feed at most
    # that many trailing tokens so long prompts do not overflow the table.
    max_context = model.position_embedding.num_embeddings

    with torch.no_grad():
        for _ in range(max_length):
            window = generated[-max_context:]
            input_seq = torch.tensor(window, dtype=torch.long).unsqueeze(0)  # Batch size 1
            src_mask = model.generate_square_subsequent_mask(input_seq.size(1)).to(input_seq.device)
            outputs = model(input_seq, src_mask)
            next_token = torch.argmax(outputs[0, -1, :]).item()

            if next_token == pad_id:
                break

            generated.append(next_token)

    # Convert numerical tokens back to words
    inv_vocab = {idx: word for word, idx in vocab.items()}
    return ' '.join(inv_vocab.get(tok, '<UNK>') for tok in generated)
66
+
67
+ @app.route('/', methods=['GET'])
68
def index():
    """Serve the empty chat page."""
    page = render_template('index.html')
    return page
70
+
71
+ @app.route('/chat', methods=['POST'])
72
def chat():
    """Handle a submitted chat message and render the model's reply.

    Reads the ``message`` form field; when it is missing or empty the blank
    page is rendered.  Otherwise the handler sleeps for a random 1-10 seconds
    (a deliberate "thinking" delay), generates a reply with ``generate_text``
    and renders both the message and the reply.
    """
    user_message = request.form.get('message')
    if user_message:
        # Simulate thinking delay
        pause_seconds = random.randint(1, 10)
        print(f"Thinking for {pause_seconds} seconds...")
        time.sleep(pause_seconds)

        reply = generate_text(user_message)
        return render_template('index.html', message=user_message, response=reply)

    return render_template('index.html')
84
+
85
if __name__ == '__main__':
    # Bind to every interface on port 5000 so the app is reachable from outside the host.
    app.run(port=5000, host='0.0.0.0')
87
+
src/data.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# After saving processed_data.json
import json
import os

from utils import tokenize, build_vocab, save_vocab
# load_data is defined in data_processing (utils has no such function).
from data_processing import load_data


def prepare_training_data(processed_data, vocab_path='vocab.json'):
    """Tokenize the loaded documents and build/save the vocabulary.

    Args:
        processed_data: Iterable of entries.  Strings are tokenized directly;
            lists contribute each of their string items; anything else is
            ignored.
        vocab_path: Where the vocabulary JSON is written.

    Returns:
        ``(tokenized_texts, vocab)`` - the list of token lists and the
        vocabulary built from them.
    """
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            tokenized_texts.append(tokenize(entry))
        elif isinstance(entry, list):
            tokenized_texts.extend(
                tokenize(item) for item in entry if isinstance(item, str)
            )
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab


if __name__ == "__main__":
    data = load_data()
    tokenized_texts, vocab = prepare_training_data(data)
    # Save tokenized data
    output_path = 'data/processed/tokenized_data.json'
    # The output directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)
    print("Data processing complete. Tokenized data saved to data/processed/tokenized_data.json")
src/data_processing.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/data_processing.py
2
+ import os
3
+ import json
4
+ import csv
5
+ from pdfminer.high_level import extract_text
6
+ import pandas as pd
7
+ from utils import tokenize, build_vocab, save_vocab
8
+
9
def read_txt(file_path):
    """Return the full contents of a UTF-8 encoded text file as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
12
+
13
def read_pdf(file_path):
    """Return the text pdfminer's ``extract_text`` produces for the given PDF."""
    pdf_text = extract_text(file_path)
    return pdf_text
15
+
16
def read_json(file_path):
    """Parse a UTF-8 encoded JSON file and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
19
+
20
def read_csv(file_path):
    """Flatten every cell of a CSV file into one space-separated string.

    Cells are visited row by row, left to right, and rendered with pandas'
    string conversion.  Missing cells are skipped so that the literal word
    "nan" does not leak into the training text.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A single string containing all non-missing cell values.
    """
    df = pd.read_csv(file_path)
    present = df.notna().to_numpy().ravel()
    cells = df.astype(str).to_numpy().ravel()
    # Filter with the mask computed *before* stringification, otherwise
    # missing values are indistinguishable from a real "nan" token.
    return ' '.join(str(cell) for cell, keep in zip(cells, present) if keep)
25
+
26
def process_file(file_path):
    """Read one raw data file with the reader matching its extension.

    Supported (case-insensitive) extensions are ``.txt``, ``.pdf``, ``.json``
    and ``.csv``.  Any other extension prints a notice and yields ``None``.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.txt':
        return read_txt(file_path)
    if extension == '.pdf':
        return read_pdf(file_path)
    if extension == '.json':
        return read_json(file_path)
    if extension == '.csv':
        return read_csv(file_path)

    print(f"Unsupported file format: {extension}")
    return None
40
+
41
def load_data(raw_data_dir='data/raw'):
    """Recursively read every file under ``raw_data_dir``.

    Each file is passed to ``process_file``; falsy results (unsupported
    formats, empty content) are dropped.  Returns the list of loaded entries.
    """
    collected = []
    for dirpath, _dirnames, filenames in os.walk(raw_data_dir):
        for filename in filenames:
            loaded = process_file(os.path.join(dirpath, filename))
            if not loaded:
                continue
            collected.append(loaded)
    return collected
50
+
51
def prepare_training_data(processed_data, vocab_path='vocab.json'):
    """Tokenize loaded entries, then build and persist the vocabulary.

    String entries are tokenized as-is; list entries contribute each of their
    string items; every other entry type is ignored.  Returns the pair
    ``(tokenized_texts, vocab)``.
    """
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            texts = [entry]
        elif isinstance(entry, list):
            texts = [item for item in entry if isinstance(item, str)]
        else:
            texts = []
        for text in texts:
            tokenized_texts.append(tokenize(text))

    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab
65
+
66
def save_tokenized_data(tokenized_texts, filepath='data/processed/tokenized_data.json'):
    """Write the tokenized texts to ``filepath`` as indented UTF-8 JSON.

    The parent directory is created when it does not exist yet, so the
    function works on a fresh checkout.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # a bare filename has no directory component to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)
69
+
70
def save_processed_data(processed_data, filepath='data/processed/processed_data.json'):
    """Write the loaded raw entries to ``filepath`` as indented UTF-8 JSON.

    The parent directory is created when it does not exist yet, so the
    function works on a fresh checkout.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # a bare filename has no directory component to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=4)
73
+
74
if __name__ == "__main__":
    # Step 1: read everything under the default raw-data directory.
    print("Loading raw data...")
    data = load_data()
    print(f"Loaded {len(data)} data entries.")

    # Step 2: tokenize, build the vocabulary, and persist all artifacts.
    print("Preparing training data...")
    tokenized_texts, vocab = prepare_training_data(data)
    save_tokenized_data(tokenized_texts)
    save_processed_data(data)
    print("Data processing complete.")
    print(f"Vocabulary size: {len(vocab)}")
src/evaluate.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/evaluate.py
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.utils.data import Dataset, DataLoader
5
+ import json
6
+ from model import TransformerModel
7
+ from utils import load_vocab
8
+ from tqdm import tqdm
9
+ import os
10
+
11
class TextDataset(Dataset):
    """Next-token-prediction dataset over a JSON file of token lists.

    Each item is built from the first ``seq_length + 1`` tokens of one entry
    (right-padded with ``<PAD>`` when shorter) and returned as an
    ``(input, target)`` pair where the target is the input shifted by one.
    """

    def __init__(self, data_path, vocab, seq_length=50):
        self.seq_length = seq_length
        self.vocab = vocab
        with open(data_path, 'r', encoding='utf-8') as handle:
            self.data = json.load(handle)

    def __len__(self):
        return len(self.data)

    def numericalize(self, tokens):
        """Map tokens to ids, using the ``<UNK>`` id for unknown tokens."""
        ids = []
        for token in tokens:
            ids.append(self.vocab.get(token, self.vocab['<UNK>']))
        return ids

    def __getitem__(self, idx):
        window = self.seq_length + 1
        ids = self.numericalize(self.data[idx])[:window]
        shortfall = window - len(ids)
        if shortfall > 0:
            ids.extend([self.vocab['<PAD>']] * shortfall)
        inputs = torch.tensor(ids[:-1], dtype=torch.long)
        targets = torch.tensor(ids[1:], dtype=torch.long)
        return inputs, targets
34
+
35
def collate_fn(batch):
    """Stack a list of ``(input, target)`` tensor pairs into two batch tensors."""
    input_items, target_items = zip(*batch)
    return torch.stack(input_items), torch.stack(target_items)
40
+
41
def get_dataloader(data_path, vocab, batch_size=64, seq_length=50):
    """Build a non-shuffling DataLoader over the tokenized data for evaluation."""
    return DataLoader(
        TextDataset(data_path, vocab, seq_length),
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )
45
+
46
def evaluate_model(config):
    """Evaluate a saved checkpoint and report per-token loss and perplexity.

    The loss is summed over every non-padding target token and divided by the
    number of such tokens, so batches with different amounts of padding (or a
    smaller final batch) are weighted correctly.

    Args:
        config: Mapping with the keys ``vocab_path``, ``model_path``,
            ``data_path``, ``embed_size``, ``num_heads``, ``hidden_dim``,
            ``num_layers``, ``dropout``, ``batch_size`` and ``seq_length``.

    Returns:
        ``(average_loss, perplexity)`` as Python floats.

    Raises:
        ValueError: If the dataset contains no non-padding target tokens, in
            which case an average cannot be computed.
    """
    # Load vocabulary
    vocab = load_vocab(config['vocab_path'])
    vocab_size = len(vocab)
    pad_id = vocab['<PAD>']

    # Initialize model
    model = TransformerModel(
        vocab_size=vocab_size,
        embed_size=config['embed_size'],
        num_heads=config['num_heads'],
        hidden_dim=config['hidden_dim'],
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )

    # Load model weights on CPU first, then move to the evaluation device.
    model.load_state_dict(torch.load(config['model_path'], map_location=torch.device('cpu')))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    # Summed (not averaged) loss so the division by the true token count
    # happens once, over the whole dataset.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_id, reduction='sum')

    # DataLoader
    dataloader = get_dataloader(
        data_path=config['data_path'],
        vocab=vocab,
        batch_size=config['batch_size'],
        seq_length=config['seq_length']
    )

    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for inputs, targets in tqdm(dataloader, desc="Evaluating"):
            inputs = inputs.to(device)
            targets = targets.to(device)

            src_mask = model.generate_square_subsequent_mask(inputs.size(1)).to(device)
            outputs = model(inputs, src_mask)
            loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))
            total_loss += loss.item()
            total_tokens += (targets != pad_id).sum().item()

    if total_tokens == 0:
        raise ValueError("No non-padding target tokens found; cannot compute average loss.")

    average_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(average_loss)).item()
    print(f"Average Loss: {average_loss:.4f}")
    print(f"Perplexity: {perplexity:.4f}")
    return average_loss, perplexity
97
+
98
if __name__ == "__main__":
    # Architecture values must match the ones the checkpoint was trained with.
    config = dict(
        vocab_path='vocab.json',
        data_path='data/processed/tokenized_data.json',
        model_path='models/3ed0k4_model_epoch10.pth',  # Update accordingly
        embed_size=256,
        num_heads=8,
        hidden_dim=512,
        num_layers=4,
        dropout=0.1,
        batch_size=64,
        seq_length=50,
    )
    evaluate_model(config)
src/model.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/model.py
2
+ import torch
3
+ import torch.nn as nn
4
+
5
class TransformerModel(nn.Module):
    """Token-level language model built on ``nn.TransformerEncoder``.

    Token and learned position embeddings are summed, passed through a stack
    of encoder layers under the supplied attention mask, and projected to
    vocabulary logits.

    Args:
        vocab_size: Number of entries in the vocabulary.
        embed_size: Embedding / model dimension.
        num_heads: Attention heads per encoder layer.
        hidden_dim: Feed-forward dimension inside each encoder layer.
        num_layers: Number of encoder layers.
        dropout: Dropout probability.
        max_len: Number of learned positions, i.e. the longest sequence the
            model accepts.  Defaults to 5000, the value existing checkpoints
            were created with.
    """

    def __init__(self, vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout=0.1, max_len=5000):
        super().__init__()
        self.embed_size = embed_size
        self.max_len = max_len
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_len, embed_size)  # Max sequence length

        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Return logits of shape ``[batch, seq, vocab]`` for token ids ``src``.

        Args:
            src: Integer tensor of shape ``[batch, seq]``.
            src_mask: Attention mask passed to the encoder, e.g. the result
                of ``generate_square_subsequent_mask(seq)``.

        Raises:
            ValueError: If the sequence is longer than the number of learned
                positions.
        """
        batch_size, seq_length = src.size()
        if seq_length > self.position_embedding.num_embeddings:
            raise ValueError(
                f"Sequence length {seq_length} exceeds the maximum supported "
                f"length {self.position_embedding.num_embeddings}."
            )
        # Built directly on the input's device; shape [1, seq] broadcasts over the batch.
        positions = torch.arange(seq_length, device=src.device).unsqueeze(0)

        x = self.token_embedding(src) + self.position_embedding(positions)
        x = self.dropout(x)
        x = x.permute(1, 0, 2)  # Transformer expects [seq_length, batch_size, embed_size]
        transformer_out = self.transformer_encoder(x, src_mask)
        transformer_out = transformer_out.permute(1, 0, 2)
        logits = self.fc_out(transformer_out)
        return logits

    def generate_square_subsequent_mask(self, sz):
        """Return an ``sz x sz`` float mask: 0 on/below the diagonal, -inf above.

        Added to attention scores, it stops each position from attending to
        later positions.
        """
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)
src/templates/index.html ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Chatbot</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ text-align: center;
11
+ }
12
+
13
+ .container {
14
+ width: 50%;
15
+ margin: 0 auto;
16
+ }
17
+
18
+ .chat-container {
19
+ display: flex;
20
+ flex-direction: column;
21
+ align-items: center;
22
+ padding: 20px;
23
+ border: 1px solid #ddd;
24
+ border-radius: 10px;
25
+ margin-bottom: 20px;
26
+ }
27
+
28
+ .chat-message {
29
+ margin-bottom: 10px;
30
+ }
31
+
32
+ .chat-message:first-child {
33
+ margin-top: 0;
34
+ }
35
+
36
+ .chat-message:last-child {
37
+ margin-bottom: 0;
38
+ }
39
+
40
+ .chat-user {
41
+ font-weight: bold;
42
+ }
43
+
44
+ .chat-bot {
45
+ color: #666;
46
+ }
47
+ </style>
48
+ </head>
49
+ <body>
50
+ <div class="container">
51
+ <h1>Chatbot</h1>
52
+ <form action="/chat" method="POST">
53
+ <input type="text" name="message" placeholder="Type a message..." />
54
+ <button type="submit">Send</button>
55
+ </form>
56
+
57
+ <div class="chat-container">
58
+ {% if message %}
59
+ <div class="chat-message">
60
+ <span class="chat-user">You:</span>
61
+ {{ message }}
62
+ </div>
63
+ {% endif %}
64
+
65
+ {% if response %}
66
+ <div class="chat-message">
67
+ <span class="chat-bot">Bot:</span>
68
+ {{ response }}
69
+ </div>
70
+ {% endif %}
71
+ </div>
72
+ </div>
73
+ </body>
74
+ </html>
src/train.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/train.py
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.optim as optim
5
+ from torch.utils.data import Dataset, DataLoader
6
+ import json
7
+ from model import TransformerModel
8
+ from utils import load_vocab, tokenize
9
+ from tqdm import tqdm
10
+ import os
11
+ import subprocess
12
+
13
class TextDataset(Dataset):
    """Training dataset of fixed-length next-token-prediction samples.

    The JSON file at ``data_path`` holds one token list per entry.  Every
    sample uses at most the first ``seq_length + 1`` tokens of its entry,
    right-padded with ``<PAD>``; inputs and targets are that window shifted
    by one token against each other.
    """

    def __init__(self, data_path, vocab, seq_length=50):
        with open(data_path, 'r', encoding='utf-8') as source:
            loaded = json.load(source)
        self.data = loaded
        self.vocab = vocab
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data)

    def numericalize(self, tokens):
        """Translate tokens into vocabulary ids (``<UNK>`` for unseen tokens)."""
        lookup = self.vocab.get
        return [lookup(token, self.vocab['<UNK>']) for token in tokens]

    def __getitem__(self, idx):
        needed = self.seq_length + 1
        encoded = self.numericalize(self.data[idx])
        if len(encoded) >= needed:
            encoded = encoded[:needed]
        else:
            padding = [self.vocab['<PAD>']] * (needed - len(encoded))
            encoded = encoded + padding
        return (
            torch.tensor(encoded[:-1], dtype=torch.long),
            torch.tensor(encoded[1:], dtype=torch.long),
        )
36
+
37
def collate_fn(batch):
    """Combine ``(input, target)`` samples into stacked input and target tensors."""
    input_items, target_items = zip(*batch)
    stacked_inputs = torch.stack(input_items)
    stacked_targets = torch.stack(target_items)
    return stacked_inputs, stacked_targets
42
+
43
def get_dataloader(data_path, vocab, batch_size=64, seq_length=50):
    """Build a shuffling DataLoader over the tokenized training data."""
    return DataLoader(
        TextDataset(data_path, vocab, seq_length),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
47
+
48
def train_model(config):
    """Train the transformer language model and checkpoint it after each epoch.

    When the vocabulary file is missing, ``data_processing.py`` (located next
    to this file) is run first with the same interpreter that is executing
    this script.

    Args:
        config: Mapping with the keys ``vocab_path``, ``data_path``,
            ``embed_size``, ``num_heads``, ``hidden_dim``, ``num_layers``,
            ``dropout``, ``learning_rate``, ``batch_size``, ``seq_length``
            and ``epochs``.

    Raises:
        subprocess.CalledProcessError: If the data-processing step fails.
    """
    import sys  # local import: only needed to locate the running interpreter

    # Check if vocab.json exists
    if not os.path.exists(config['vocab_path']):
        print("vocab.json not found. Running data_processing.py...")
        # Use the current interpreter (a bare "python" may be missing or point
        # at another environment) and resolve the script relative to this
        # file rather than the working directory.
        script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data_processing.py')
        subprocess.run([sys.executable, script_path], check=True)

    # Load vocabulary
    vocab = load_vocab(config['vocab_path'])
    vocab_size = len(vocab)

    # Initialize model
    model = TransformerModel(
        vocab_size=vocab_size,
        embed_size=config['embed_size'],
        num_heads=config['num_heads'],
        hidden_dim=config['hidden_dim'],
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Loss and optimizer; padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])

    # DataLoader
    dataloader = get_dataloader(
        data_path=config['data_path'],
        vocab=vocab,
        batch_size=config['batch_size'],
        seq_length=config['seq_length']
    )

    # Training loop
    model.train()
    for epoch in range(1, config['epochs'] + 1):
        epoch_loss = 0
        progress = tqdm(dataloader, desc=f"Epoch {epoch}/{config['epochs']}")
        for inputs, targets in progress:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            src_mask = model.generate_square_subsequent_mask(inputs.size(1)).to(device)
            outputs = model(inputs, src_mask)
            loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress.set_postfix(loss=loss.item())
        avg_loss = epoch_loss / len(dataloader)
        print(f"Epoch {epoch} completed. Average Loss: {avg_loss:.4f}")

        # Save model after each epoch
        os.makedirs('models', exist_ok=True)
        torch.save(model.state_dict(), f"models/3ed0k4_model_epoch{epoch}.pth")
        print(f"Model saved at models/3ed0k4_model_epoch{epoch}.pth")
108
+
109
if __name__ == "__main__":
    # Default hyper-parameters for a training run started from the command line.
    config = dict(
        vocab_path='vocab.json',
        data_path='data/processed/tokenized_data.json',
        embed_size=256,
        num_heads=8,
        hidden_dim=512,
        num_layers=4,
        dropout=0.1,
        learning_rate=0.001,
        batch_size=64,
        seq_length=50,
        epochs=10,
    )
    train_model(config)
src/upload_to_hf.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/upload_to_hf.py
2
+ from transformers import PreTrainedTokenizerFast
3
+ import torch
4
+ from model import TransformerModel
5
+ from utils import load_vocab
6
+ import json
7
+ import os
8
+
9
+ # Configuration
10
+ MODEL_PATH = 'models/3ed0k4_model_epoch10.pth' # Update this path
11
+ VOCAB_PATH = 'vocab.json'
12
+ TOKENIZER_DIR = 'tokenizer'
13
+ HF_MODEL_REPO = '3ed0k4/3ed0k4' # Replace with your Hugging Face repo
14
+
15
+ # Initialize tokenizer
16
def init_tokenizer(vocab):
    """Create a fast tokenizer, add every vocabulary entry, and save it to ``TOKENIZER_DIR``."""
    # NOTE(review): PreTrainedTokenizerFast is constructed without a backend
    # tokenizer (tokenizer_file=None); this presumably fails at construction
    # time - confirm against the installed transformers version.
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=None)
    vocabulary_entries = list(vocab.keys())
    tokenizer.add_tokens(vocabulary_entries)
    tokenizer.save_pretrained(TOKENIZER_DIR)
    print(f"Tokenizer saved to {TOKENIZER_DIR}/")
21
+
22
+ # Prepare model
23
def prepare_model(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout, model_path):
    """Rebuild the model, load the checkpoint, and export ``pytorch_model.bin``.

    Args:
        vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout:
            Architecture parameters; they must match the checkpoint.
        model_path: Path of the ``state_dict`` checkpoint to load.

    The weights are written to ``pytorch_model.bin`` in the current working
    directory.
    """
    model = TransformerModel(
        vocab_size=vocab_size,
        embed_size=embed_size,
        num_heads=num_heads,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        dropout=dropout
    )
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    # TransformerModel is a plain nn.Module and has no `save_pretrained`
    # method (calling it raised AttributeError before anything was saved), so
    # the weights are exported with torch.save only.
    torch.save(model.state_dict(), 'pytorch_model.bin')
    print("Model weights saved as pytorch_model.bin")
38
+
39
+ # Create config.json
40
def create_config(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout):
    """Write the model hyper-parameters to ``config.json`` in the working directory."""
    settings = dict(
        vocab_size=vocab_size,
        embed_size=embed_size,
        num_heads=num_heads,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        dropout=dropout,
    )
    with open('config.json', 'w') as handle:
        json.dump(settings, handle, indent=4)
    print("Config saved as config.json")
52
+
53
if __name__ == "__main__":
    # Load vocabulary
    vocab = load_vocab(VOCAB_PATH)
    vocab_size = len(vocab)

    # Initialize tokenizer
    init_tokenizer(vocab)

    # Model parameters (must match the values used for training)
    embed_size, num_heads, hidden_dim, num_layers, dropout = 256, 8, 512, 4, 0.1

    # Prepare and save model
    prepare_model(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout, MODEL_PATH)

    # Create config.json
    create_config(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout)

    print("Model preparation for Hugging Face completed.")
src/utils.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/utils.py
2
+ import re
3
+ from collections import Counter
4
+ import json
5
+
6
def tokenize(text):
    """
    Lower-cases the text and returns its runs of word characters as a list of tokens.
    """
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)
12
+
13
def build_vocab(tokenized_texts, min_freq=2):
    """
    Creates a token -> id mapping from tokenized texts.
    Ids 0 and 1 are reserved for '<PAD>' and '<UNK>'; the remaining tokens get
    consecutive ids in order of first appearance, skipping those seen fewer
    than `min_freq` times.
    """
    frequencies = Counter()
    for token_list in tokenized_texts:
        frequencies.update(token_list)

    vocab = {'<PAD>': 0, '<UNK>': 1}
    frequent_words = [word for word, count in frequencies.items() if count >= min_freq]
    for word in frequent_words:
        vocab[word] = len(vocab)
    return vocab
26
+
27
def save_vocab(vocab, filepath='vocab.json'):
    """
    Writes the vocabulary dictionary to `filepath` as indented UTF-8 JSON.
    """
    serialized = json.dumps(vocab, ensure_ascii=False, indent=4)
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
33
+
34
def load_vocab(filepath='vocab.json'):
    """
    Reads a vocabulary dictionary back from a UTF-8 JSON file.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        vocab = json.load(handle)
    return vocab