Upload 12 files
- src/__pycache__/model.cpython-312.pyc +0 -0
- src/__pycache__/train.cpython-312.pyc +0 -0
- src/__pycache__/utils.cpython-312.pyc +0 -0
- src/app.py +87 -0
- src/data.py +27 -0
- src/data_processing.py +84 -0
- src/evaluate.py +111 -0
- src/model.py +38 -0
- src/templates/index.html +74 -0
- src/train.py +123 -0
- src/upload_to_hf.py +74 -0
- src/utils.py +39 -0
src/__pycache__/model.cpython-312.pyc
ADDED
Binary file (2.99 kB)

src/__pycache__/train.cpython-312.pyc
ADDED
Binary file (6.41 kB)

src/__pycache__/utils.cpython-312.pyc
ADDED
Binary file (1.97 kB)
src/app.py
ADDED
@@ -0,0 +1,87 @@
# src/app.py
from flask import Flask, request, render_template
import torch
from model import TransformerModel
from utils import load_vocab, tokenize
import time
import random
import os

app = Flask(__name__, template_folder='templates')

# Configuration
MODEL_PATH = 'models/3ed0k4_model_epoch10.pth'  # Update this path based on the latest model
VOCAB_PATH = 'vocab.json'
EMBED_SIZE = 256
NUM_HEADS = 8
HIDDEN_DIM = 512
NUM_LAYERS = 4
DROPOUT = 0.1
MAX_LENGTH = 100  # Maximum tokens to generate

# Load vocabulary
vocab = load_vocab(VOCAB_PATH)
vocab_size = len(vocab)

# Initialize model
model = TransformerModel(
    vocab_size=vocab_size,
    embed_size=EMBED_SIZE,
    num_heads=NUM_HEADS,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
)

# Load model weights
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model file not found at {MODEL_PATH}. Please train the model first.")
model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu')))
model.eval()

def generate_text(prompt, max_length=MAX_LENGTH):
    tokens = tokenize(prompt)
    numericalized = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    input_seq = torch.tensor(numericalized, dtype=torch.long).unsqueeze(0)  # Batch size 1

    generated = numericalized.copy()

    with torch.no_grad():
        for _ in range(max_length):
            src_mask = model.generate_square_subsequent_mask(input_seq.size(1)).to(input_seq.device)
            outputs = model(input_seq, src_mask)
            next_token_logits = outputs[0, -1, :]
            next_token = torch.argmax(next_token_logits).item()  # Greedy decoding

            if next_token == vocab['<PAD>']:
                break

            generated.append(next_token)
            input_seq = torch.tensor(generated, dtype=torch.long).unsqueeze(0)

    # Convert numerical tokens back to words
    inv_vocab = {idx: word for word, idx in vocab.items()}
    generated_tokens = [inv_vocab.get(tok, '<UNK>') for tok in generated]
    return ' '.join(generated_tokens)

@app.route('/', methods=['GET'])
def index():
    return render_template('index.html')

@app.route('/chat', methods=['POST'])
def chat():
    message = request.form.get('message')
    if not message:
        return render_template('index.html')

    # Simulate thinking delay
    delay = random.randint(1, 10)
    print(f"Thinking for {delay} seconds...")
    time.sleep(delay)

    response = generate_text(message)
    return render_template('index.html', message=message, response=response)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
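For a quick end-to-end check of the /chat endpoint once the app is running, something like the following should work. This is a minimal sketch, not part of the commit; it assumes the requests package is installed and the server is listening on localhost:5000 as configured above.

# smoke test for src/app.py (hypothetical helper, not in this commit)
import requests

resp = requests.post('http://localhost:5000/chat', data={'message': 'hello world'})
print(resp.status_code)        # expect 200
print('Bot:' in resp.text)     # the rendered template should contain the reply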
src/data.py
ADDED
@@ -0,0 +1,27 @@
# After saving processed_data.json
from utils import tokenize, build_vocab, save_vocab
from data_processing import load_data  # load_data is defined in data_processing.py, not utils.py
import json
import os

def prepare_training_data(processed_data, vocab_path='vocab.json'):
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            tokens = tokenize(entry)
            tokenized_texts.append(tokens)
        elif isinstance(entry, list):
            for item in entry:
                if isinstance(item, str):
                    tokens = tokenize(item)
                    tokenized_texts.append(tokens)
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab

if __name__ == "__main__":
    data = load_data()
    tokenized_texts, vocab = prepare_training_data(data)
    # Save tokenized data (create the output directory on a fresh checkout)
    os.makedirs('data/processed', exist_ok=True)
    with open('data/processed/tokenized_data.json', 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)
    print("Data processing complete. Tokenized data saved to data/processed/tokenized_data.json")
src/data_processing.py
ADDED
@@ -0,0 +1,84 @@
# src/data_processing.py
import os
import json
import csv
from pdfminer.high_level import extract_text
import pandas as pd
from utils import tokenize, build_vocab, save_vocab

def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def read_pdf(file_path):
    return extract_text(file_path)

def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def read_csv(file_path):
    df = pd.read_csv(file_path)
    # Concatenate all text columns into a single string
    text = ' '.join(df.astype(str).values.flatten())
    return text

def process_file(file_path):
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()
    if ext == '.txt':
        return read_txt(file_path)
    elif ext == '.pdf':
        return read_pdf(file_path)
    elif ext == '.json':
        return read_json(file_path)
    elif ext == '.csv':
        return read_csv(file_path)
    else:
        print(f"Unsupported file format: {ext}")
        return None

def load_data(raw_data_dir='data/raw'):
    all_data = []
    for root, dirs, files in os.walk(raw_data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            data = process_file(file_path)
            if data:
                all_data.append(data)
    return all_data

def prepare_training_data(processed_data, vocab_path='vocab.json'):
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            tokens = tokenize(entry)
            tokenized_texts.append(tokens)
        elif isinstance(entry, list):
            for item in entry:
                if isinstance(item, str):
                    tokens = tokenize(item)
                    tokenized_texts.append(tokens)
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab

def save_tokenized_data(tokenized_texts, filepath='data/processed/tokenized_data.json'):
    os.makedirs(os.path.dirname(filepath), exist_ok=True)  # ensure the output directory exists
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)

def save_processed_data(processed_data, filepath='data/processed/processed_data.json'):
    os.makedirs(os.path.dirname(filepath), exist_ok=True)  # ensure the output directory exists
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    print("Loading raw data...")
    data = load_data()
    print(f"Loaded {len(data)} data entries.")

    print("Preparing training data...")
    tokenized_texts, vocab = prepare_training_data(data)
    save_tokenized_data(tokenized_texts)
    save_processed_data(data)
    print("Data processing complete.")
    print(f"Vocabulary size: {len(vocab)}")
src/evaluate.py
ADDED
@@ -0,0 +1,111 @@
# src/evaluate.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import json
from model import TransformerModel
from utils import load_vocab
from tqdm import tqdm
import os

class TextDataset(Dataset):
    def __init__(self, data_path, vocab, seq_length=50):
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        self.vocab = vocab
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data)

    def numericalize(self, tokens):
        return [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

    def __getitem__(self, idx):
        tokens = self.data[idx]
        numericalized = self.numericalize(tokens)
        # Pad or truncate to seq_length + 1 so input/target pairs line up
        if len(numericalized) < self.seq_length + 1:
            numericalized += [self.vocab['<PAD>']] * (self.seq_length + 1 - len(numericalized))
        else:
            numericalized = numericalized[:self.seq_length + 1]
        input_seq = torch.tensor(numericalized[:-1], dtype=torch.long)
        target_seq = torch.tensor(numericalized[1:], dtype=torch.long)
        return input_seq, target_seq

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs = torch.stack(inputs)
    targets = torch.stack(targets)
    return inputs, targets

def get_dataloader(data_path, vocab, batch_size=64, seq_length=50):
    dataset = TextDataset(data_path, vocab, seq_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    return dataloader

def evaluate_model(config):
    # Load vocabulary
    vocab = load_vocab(config['vocab_path'])
    vocab_size = len(vocab)

    # Initialize model
    model = TransformerModel(
        vocab_size=vocab_size,
        embed_size=config['embed_size'],
        num_heads=config['num_heads'],
        hidden_dim=config['hidden_dim'],
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )

    # Load model weights
    model.load_state_dict(torch.load(config['model_path'], map_location=torch.device('cpu')))
    model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Loss function
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])

    # DataLoader
    dataloader = get_dataloader(
        data_path=config['data_path'],
        vocab=vocab,
        batch_size=config['batch_size'],
        seq_length=config['seq_length']
    )

    total_loss = 0
    total_sequences = 0

    with torch.no_grad():
        for inputs, targets in tqdm(dataloader, desc="Evaluating"):
            inputs = inputs.to(device)
            targets = targets.to(device)

            src_mask = model.generate_square_subsequent_mask(inputs.size(1)).to(device)
            outputs = model(inputs, src_mask)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            # Weight the batch-mean loss by batch size so the average is per sequence
            total_loss += loss.item() * inputs.size(0)
            total_sequences += inputs.size(0)

    average_loss = total_loss / total_sequences
    perplexity = torch.exp(torch.tensor(average_loss))
    print(f"Average Loss: {average_loss:.4f}")
    print(f"Perplexity: {perplexity:.4f}")

if __name__ == "__main__":
    config = {
        'vocab_path': 'vocab.json',
        'data_path': 'data/processed/tokenized_data.json',
        'model_path': 'models/3ed0k4_model_epoch10.pth',  # Update accordingly
        'embed_size': 256,
        'num_heads': 8,
        'hidden_dim': 512,
        'num_layers': 4,
        'dropout': 0.1,
        'batch_size': 64,
        'seq_length': 50,
    }
    evaluate_model(config)
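Note on the reported metric: the script computes perplexity as PPL = exp(sum_b(loss_b * n_b) / sum_b(n_b)), where loss_b is the batch-mean cross-entropy and n_b the number of sequences in the batch. Because the criterion ignores <PAD> positions while the weighting is per sequence rather than per non-pad token, this is an approximation of token-level perplexity, not an exact value.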
src/model.py
ADDED
@@ -0,0 +1,38 @@
# src/model.py
import torch
import torch.nn as nn

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embed_size = embed_size
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(5000, embed_size)  # Max sequence length

        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        batch_size, seq_length = src.size()
        positions = torch.arange(0, seq_length).unsqueeze(0).repeat(batch_size, 1).to(src.device)

        x = self.token_embedding(src) + self.position_embedding(positions)
        x = self.dropout(x)
        x = x.permute(1, 0, 2)  # Transformer expects [seq_length, batch_size, embed_size]
        transformer_out = self.transformer_encoder(x, src_mask)
        transformer_out = transformer_out.permute(1, 0, 2)
        logits = self.fc_out(transformer_out)
        return logits

    def generate_square_subsequent_mask(self, sz):
        # Additive causal mask: 0 on and below the diagonal, -inf above it
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
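As a quick shape sanity check for the model, a minimal sketch (not in this commit; vocab_size=1000 is an arbitrary stand-in, the other hyperparameters mirror src/train.py):

# smoke test for src/model.py (hypothetical, not in this commit)
import torch
from model import TransformerModel

model = TransformerModel(vocab_size=1000, embed_size=256, num_heads=8,
                         hidden_dim=512, num_layers=4, dropout=0.1)
src = torch.randint(0, 1000, (2, 50))             # [batch_size, seq_length]
mask = model.generate_square_subsequent_mask(50)  # [50, 50] additive causal mask
logits = model(src, mask)
print(logits.shape)  # expected: torch.Size([2, 50, 1000])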
src/templates/index.html
ADDED
@@ -0,0 +1,74 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Chatbot</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            text-align: center;
        }

        .container {
            width: 50%;
            margin: 0 auto;
        }

        .chat-container {
            display: flex;
            flex-direction: column;
            align-items: center;
            padding: 20px;
            border: 1px solid #ddd;
            border-radius: 10px;
            margin-bottom: 20px;
        }

        .chat-message {
            margin-bottom: 10px;
        }

        .chat-message:first-child {
            margin-top: 0;
        }

        .chat-message:last-child {
            margin-bottom: 0;
        }

        .chat-user {
            font-weight: bold;
        }

        .chat-bot {
            color: #666;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>Chatbot</h1>
        <form action="/chat" method="POST">
            <input type="text" name="message" placeholder="Type a message..." />
            <button type="submit">Send</button>
        </form>

        <div class="chat-container">
            {% if message %}
            <div class="chat-message">
                <span class="chat-user">You:</span>
                {{ message }}
            </div>
            {% endif %}

            {% if response %}
            <div class="chat-message">
                <span class="chat-bot">Bot:</span>
                {{ response }}
            </div>
            {% endif %}
        </div>
    </div>
</body>
</html>
src/train.py
ADDED
@@ -0,0 +1,123 @@
# src/train.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import json
from model import TransformerModel
from utils import load_vocab, tokenize
from tqdm import tqdm
import os
import subprocess

class TextDataset(Dataset):
    def __init__(self, data_path, vocab, seq_length=50):
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        self.vocab = vocab
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data)

    def numericalize(self, tokens):
        return [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

    def __getitem__(self, idx):
        tokens = self.data[idx]
        numericalized = self.numericalize(tokens)
        # Pad or truncate to seq_length + 1 so input/target pairs line up
        if len(numericalized) < self.seq_length + 1:
            numericalized += [self.vocab['<PAD>']] * (self.seq_length + 1 - len(numericalized))
        else:
            numericalized = numericalized[:self.seq_length + 1]
        input_seq = torch.tensor(numericalized[:-1], dtype=torch.long)
        target_seq = torch.tensor(numericalized[1:], dtype=torch.long)
        return input_seq, target_seq

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs = torch.stack(inputs)
    targets = torch.stack(targets)
    return inputs, targets

def get_dataloader(data_path, vocab, batch_size=64, seq_length=50):
    dataset = TextDataset(data_path, vocab, seq_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    return dataloader

def train_model(config):
    # Check if vocab.json exists
    if not os.path.exists(config['vocab_path']):
        print("vocab.json not found. Running data_processing.py...")
        subprocess.run(['python', 'src/data_processing.py'], check=True)

    # Load vocabulary
    vocab = load_vocab(config['vocab_path'])
    vocab_size = len(vocab)

    # Initialize model
    model = TransformerModel(
        vocab_size=vocab_size,
        embed_size=config['embed_size'],
        num_heads=config['num_heads'],
        hidden_dim=config['hidden_dim'],
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])

    # DataLoader
    dataloader = get_dataloader(
        data_path=config['data_path'],
        vocab=vocab,
        batch_size=config['batch_size'],
        seq_length=config['seq_length']
    )

    # Training loop
    model.train()
    for epoch in range(1, config['epochs'] + 1):
        epoch_loss = 0
        progress = tqdm(dataloader, desc=f"Epoch {epoch}/{config['epochs']}")
        for inputs, targets in progress:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            src_mask = model.generate_square_subsequent_mask(inputs.size(1)).to(device)
            outputs = model(inputs, src_mask)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress.set_postfix(loss=loss.item())
        avg_loss = epoch_loss / len(dataloader)
        print(f"Epoch {epoch} completed. Average Loss: {avg_loss:.4f}")

        # Save model after each epoch
        os.makedirs('models', exist_ok=True)
        torch.save(model.state_dict(), f"models/3ed0k4_model_epoch{epoch}.pth")
        print(f"Model saved at models/3ed0k4_model_epoch{epoch}.pth")

if __name__ == "__main__":
    config = {
        'vocab_path': 'vocab.json',
        'data_path': 'data/processed/tokenized_data.json',
        'embed_size': 256,
        'num_heads': 8,
        'hidden_dim': 512,
        'num_layers': 4,
        'dropout': 0.1,
        'learning_rate': 0.001,
        'batch_size': 64,
        'seq_length': 50,
        'epochs': 10
    }
    train_model(config)
src/upload_to_hf.py
ADDED
@@ -0,0 +1,74 @@
# src/upload_to_hf.py
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
import torch
from model import TransformerModel
from utils import load_vocab
import json
import os

# Configuration
MODEL_PATH = 'models/3ed0k4_model_epoch10.pth'  # Update this path
VOCAB_PATH = 'vocab.json'
TOKENIZER_DIR = 'tokenizer'
HF_MODEL_REPO = '3ed0k4/3ed0k4'  # Replace with your Hugging Face repo

# Initialize tokenizer
def init_tokenizer(vocab):
    # Build a word-level backend tokenizer from the project vocabulary;
    # PreTrainedTokenizerFast cannot be instantiated without a backend.
    backend = Tokenizer(WordLevel(vocab, unk_token='<UNK>'))
    backend.pre_tokenizer = Whitespace()
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=backend,
        unk_token='<UNK>',
        pad_token='<PAD>'
    )
    tokenizer.save_pretrained(TOKENIZER_DIR)
    print(f"Tokenizer saved to {TOKENIZER_DIR}/")

# Prepare model
def prepare_model(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout, model_path):
    model = TransformerModel(
        vocab_size=vocab_size,
        embed_size=embed_size,
        num_heads=num_heads,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        dropout=dropout
    )
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    # TransformerModel is a plain nn.Module, so there is no save_pretrained();
    # save the raw state_dict under the filename the Hub expects.
    torch.save(model.state_dict(), 'pytorch_model.bin')
    print("Model weights saved as pytorch_model.bin")

# Create config.json
def create_config(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout):
    config = {
        "vocab_size": vocab_size,
        "embed_size": embed_size,
        "num_heads": num_heads,
        "hidden_dim": hidden_dim,
        "num_layers": num_layers,
        "dropout": dropout
    }
    with open('config.json', 'w') as f:
        json.dump(config, f, indent=4)
    print("Config saved as config.json")

if __name__ == "__main__":
    # Load vocabulary
    vocab = load_vocab(VOCAB_PATH)
    vocab_size = len(vocab)

    # Initialize tokenizer
    init_tokenizer(vocab)

    # Model parameters
    embed_size = 256
    num_heads = 8
    hidden_dim = 512
    num_layers = 4
    dropout = 0.1

    # Prepare and save model
    prepare_model(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout, MODEL_PATH)

    # Create config.json
    create_config(vocab_size, embed_size, num_heads, hidden_dim, num_layers, dropout)

    print("Model preparation for Hugging Face completed.")
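Despite its name, the script only prepares artifacts locally; the actual push to HF_MODEL_REPO still has to happen separately. A sketch of that step using huggingface_hub (not code from this commit; it assumes the package is installed and you are logged in via `huggingface-cli login`):

# hypothetical upload step, not in this commit
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(path_or_fileobj='pytorch_model.bin', path_in_repo='pytorch_model.bin',
                repo_id='3ed0k4/3ed0k4')
api.upload_file(path_or_fileobj='config.json', path_in_repo='config.json',
                repo_id='3ed0k4/3ed0k4')
api.upload_folder(folder_path='tokenizer', repo_id='3ed0k4/3ed0k4')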
src/utils.py
ADDED
@@ -0,0 +1,39 @@
# src/utils.py
import re
from collections import Counter
import json

def tokenize(text):
    """
    Simple tokenizer: lowercases the text and extracts alphanumeric word
    tokens, discarding punctuation.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

def build_vocab(tokenized_texts, min_freq=2):
    """
    Builds a vocabulary dictionary from tokenized texts.
    Tokens appearing fewer than `min_freq` times are excluded.
    """
    counter = Counter()
    for tokens in tokenized_texts:
        counter.update(tokens)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

def save_vocab(vocab, filepath='vocab.json'):
    """
    Saves the vocabulary dictionary to a JSON file.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)

def load_vocab(filepath='vocab.json'):
    """
    Loads the vocabulary dictionary from a JSON file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
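A quick illustration of how tokenize and build_vocab compose (a minimal sketch, not in this commit; the sample strings are made up):

# hypothetical usage example for src/utils.py
from utils import tokenize, build_vocab

texts = ["Hello world!", "Hello again, world."]
tokenized = [tokenize(t) for t in texts]    # [['hello', 'world'], ['hello', 'again', 'world']]
vocab = build_vocab(tokenized, min_freq=2)  # tokens seen fewer than 2 times are dropped
print(vocab)  # {'<PAD>': 0, '<UNK>': 1, 'hello': 2, 'world': 3}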