reh1t committed on
Commit
41a3de9
·
verified ·
1 Parent(s): a909edc

uploaded project

Browse files
.gitattributes CHANGED
@@ -1,35 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.mov filter=lfs diff=lfs merge=lfs -text
2
+ cpp_to_pseudo_epoch_1.pth filter=lfs diff=lfs merge=lfs -text
3
+ transformer_epoch_1.pth filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import torch.nn as nn
4
+ import json
5
+ import math
6
+
7
+ # Configure the page for a wide layout.
8
+ st.set_page_config(page_title="Code Conversion Tool", layout="wide")
9
+
10
+ # Inject custom CSS for a modern, centered card design with a gradient background.
11
+ st.markdown(
12
+ """
13
+ <style>
14
+ /* Set a subtle gradient background for the page */
15
+ body {
16
+ background: linear-gradient(135deg, #ece9e6, #ffffff);
17
+ font-family: 'Helvetica Neue', sans-serif;
18
+ }
19
+ /* Center container for the main app */
20
+ .main-container {
21
+ max-width: 800px;
22
+ margin: 3rem auto;
23
+ padding: 1rem;
24
+ }
25
+ /* Card style for a clean content box */
26
+ .card {
27
+ background: #ffffff;
28
+ border-radius: 10px;
29
+ box-shadow: 0px 4px 8px rgba(0,0,0,0.1);
30
+ padding: 2rem;
31
+ }
32
+ /* Center headings and remove underline */
33
+ h1, h2, h3 {
34
+ text-align: center;
35
+ text-decoration: none;
36
+ }
37
+ /* Style for the translation button */
38
+ .stButton>button {
39
+ background-color: #4CAF50;
40
+ color: white;
41
+ border: none;
42
+ padding: 0.5rem 1.5rem;
43
+ border-radius: 5px;
44
+ font-size: 1rem;
45
+ cursor: pointer;
46
+ }
47
+ .stButton>button:hover {
48
+ background-color: #45a049;
49
+ }
50
+ </style>
51
+ """,
52
+ unsafe_allow_html=True
53
+ )
54
+
55
+ # Wrap the app content in a centered container.
56
+ with st.container():
57
+ # Change the title here.
58
+ st.title("Code Conversion Tool")
59
+
60
+ # Load vocabulary directly (no sidebar)
61
+ with open("vocabulary.json", "r") as f:
62
+ vocab = json.load(f)
63
+
64
+ # Define separate configuration classes
65
class PseudoToCppConfig:
    """Hyperparameters for the Pseudocode → C++ model.

    Used purely as a namespace of class attributes; it is passed
    un-instantiated to the model constructor and loader.
    """

    # Vocabulary size and positional-encoding table length.
    vocab_size = 12006
    max_length = 100

    # Transformer dimensions.
    embed_dim = 256
    feedforward_dim = 512
    num_heads = 4
    num_layers = 3

    # Dropout probability handed to nn.Transformer.
    dropout = 0.2

    # Prefer CUDA when torch reports it as available, otherwise use the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
75
+
76
class CppToPseudoConfig:
    """Hyperparameters for the C++ → Pseudocode model.

    Used purely as a namespace of class attributes; it is passed
    un-instantiated to the model constructor and loader.
    """

    # Vocabulary size and positional-encoding table length.
    vocab_size = 12006
    max_length = 100

    # Transformer dimensions.
    embed_dim = 256
    feedforward_dim = 512
    num_heads = 8
    num_layers = 2

    # Dropout probability handed to nn.Transformer.
    dropout = 0.1

    # Prefer CUDA when torch reports it as available, otherwise use the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
86
+
87
+ # Positional Encoding
88
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A ``(1, max_len, embed_dim)`` table is precomputed once: even feature
    columns hold ``sin(pos * freq)`` and odd columns hold ``cos(pos * freq)``.

    Args:
        embed_dim: Size of the last (feature) dimension of the inputs.
        max_len: Longest sequence length (dimension 1) that can be encoded.
    """

    def __init__(self, embed_dim, max_len=100):
        super().__init__()
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2, dtype=torch.float)
            * (-math.log(10000.0) / embed_dim)
        )
        angles = position * div_term

        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embed_dim there is one fewer cosine column than sine
        # columns, so trim the angles to fit instead of failing on assignment.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])

        # Registered as a buffer so it follows module.to(device); it is
        # non-persistent so state_dict keys stay exactly as before and
        # previously saved checkpoints still load.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the positional table for its first ``x.size(1)`` positions.

        Raises:
            ValueError: If ``x.size(1)`` exceeds the ``max_len`` given at
                construction time.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional encoding "
                f"max_len {self.pe.size(1)}"
            )
        # The .to() is a no-op when the module and input already share a device.
        return x + self.pe[:, :seq_len].to(x.device)
100
+
101
+ # Transformer Model
102
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder transformer with one embedding table shared by source and target.

    Args:
        config: Object exposing ``vocab_size``, ``embed_dim``, ``max_length``,
            ``num_heads``, ``num_layers``, ``feedforward_dim`` and ``dropout``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        d_model = config.embed_dim
        depth = config.num_layers

        self.embedding = nn.Embedding(config.vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, config.max_length)
        # Encoder and decoder use the same number of layers.
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=config.num_heads,
            num_encoder_layers=depth,
            num_decoder_layers=depth,
            dim_feedforward=config.feedforward_dim,
            dropout=config.dropout,
        )
        self.fc_out = nn.Linear(d_model, config.vocab_size)

    def forward(self, src, tgt):
        """Return vocabulary logits for every target position.

        No attention or padding masks are passed to the transformer.
        Assumes ``src`` and ``tgt`` are ``(batch, seq)`` integer id tensors
        — TODO confirm against the training code.
        """
        scale = math.sqrt(self.config.embed_dim)
        src_emb = self.positional_encoding(self.embedding(src) * scale)
        tgt_emb = self.positional_encoding(self.embedding(tgt) * scale)

        # nn.Transformer is built without batch_first, so swap the first two
        # axes on the way in and swap them back on the way out.
        decoded = self.transformer(src_emb.transpose(0, 1), tgt_emb.transpose(0, 1))
        return self.fc_out(decoded.transpose(0, 1))
126
+
127
+ # Load Models with the appropriate configuration
128
@st.cache_resource
def load_model(path, config):
    """Build a Seq2SeqTransformer, load its weights from ``path`` and return it in eval mode.

    The function is wrapped in ``st.cache_resource``.

    Args:
        path: Checkpoint file containing a ``state_dict``.
        config: Configuration object; its ``device`` attribute decides where
            the model and the loaded tensors are placed.

    Returns:
        The model on ``config.device`` with ``eval()`` applied.
    """
    model = Seq2SeqTransformer(config).to(config.device)
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state_dict needs, so a tampered checkpoint cannot run
    # arbitrary code while being loaded.
    state_dict = torch.load(path, map_location=config.device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model
134
+
135
+ cpp_to_pseudo_model = load_model("cpp_to_pseudo_epoch_1.pth", CppToPseudoConfig)
136
+ pseudo_to_cpp_model = load_model("transformer_epoch_1.pth", PseudoToCppConfig)
137
+
138
+ # Translation Function
139
def translate(model, input_tokens, vocab, device, max_length=50):
    """Greedily decode a translation of ``input_tokens`` with ``model``.

    Args:
        model: Callable taking ``(src_ids, tgt_ids)`` tensors of shape
            ``(1, length)`` and returning logits whose last dimension is the
            vocabulary. If it exposes ``config.max_length``, both the source
            and the number of decoding steps are capped to that value so the
            positional table is never exceeded.
        input_tokens: Sequence of source tokens (strings).
        vocab: Mapping from token to id; must contain ``"<unk>"``,
            ``"<start>"`` and ``"<end>"``.
        device: Device on which the id tensors are created.
        max_length: Maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces, without the
        ``<start>``/``<end>`` markers. Empty input yields an empty string.
    """
    model.eval()
    # Nothing to translate: do not run the model on a zero-length source.
    if not input_tokens:
        return ""

    unk_id = vocab["<unk>"]
    start_id = vocab["<start>"]
    end_id = vocab["<end>"]

    input_ids = [vocab.get(token, unk_id) for token in input_tokens]

    # Respect the model's own length limit when it advertises one.
    limit = getattr(getattr(model, "config", None), "max_length", None)
    if limit is not None:
        input_ids = input_ids[:limit]
        max_length = min(max_length, limit)

    input_tensor = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)
    output_ids = [start_id]

    with torch.no_grad():
        for _ in range(max_length):
            output_tensor = torch.tensor(
                output_ids, dtype=torch.long, device=device
            ).unsqueeze(0)
            predictions = model(input_tensor, output_tensor)
            # Only the logits at the last target position choose the next token.
            next_token_id = predictions[0, -1].argmax(dim=-1).item()
            if next_token_id == end_id:
                # Stop without emitting the end-of-sequence marker.
                break
            output_ids.append(next_token_id)

    id_to_token = {idx: token for token, idx in vocab.items()}
    return " ".join(id_to_token.get(idx, "<unk>") for idx in output_ids[1:])
154
+
155
# UI elements for translation.
# The mode labels are defined once so the radio options and the comparisons
# below cannot drift apart.
CPP_TO_PSEUDO = "C++ → Pseudocode"
PSEUDO_TO_CPP = "Pseudocode → C++"

mode = st.radio("Select Translation Mode", (CPP_TO_PSEUDO, PSEUDO_TO_CPP))
user_input = st.text_area("Enter code:")

if st.button("Translate"):
    # Whitespace tokenisation; split() with no argument already ignores
    # leading and trailing whitespace.
    tokens = user_input.split()
    if not tokens:
        # Do not send an empty sequence to the model.
        st.warning("Please enter some code to translate.")
    else:
        if mode == CPP_TO_PSEUDO:
            translated_code = translate(
                cpp_to_pseudo_model, tokens, vocab, CppToPseudoConfig.device
            )
        else:
            translated_code = translate(
                pseudo_to_cpp_model, tokens, vocab, PseudoToCppConfig.device
            )
        st.subheader("Generated Translation:")
        # Highlight as C++ when generating C++, otherwise as Python.
        st.code(translated_code, language="cpp" if mode == PSEUDO_TO_CPP else "python")

# NOTE(review): these closing tags are kept from the original, but nothing in
# this file emits a matching opening <div> — confirm whether they are needed.
st.markdown('</div>', unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)
cpp_to_pseudo_epoch_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98ef6b0c38eeb68e6258f24aae44773996b89b3c624563369119004c5261c992
3
+ size 35210415
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ torch
3
+ torchvision
4
+ torchaudio
5
+ numpy
6
+ pandas
transformer.ipynb ADDED
@@ -0,0 +1,1518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "4ae62f55",
6
+ "metadata": {
7
+ "id": "4ae62f55"
8
+ },
9
+ "source": [
10
+ "# LOAD DATA"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "id": "fdf35712",
17
+ "metadata": {
18
+ "colab": {
19
+ "base_uri": "https://localhost:8080/"
20
+ },
21
+ "id": "fdf35712",
22
+ "outputId": "aba7cfe3-c992-452c-db12-378eea703c32"
23
+ },
24
+ "outputs": [
25
+ {
26
+ "output_type": "stream",
27
+ "name": "stderr",
28
+ "text": [
29
+ "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
30
+ "[nltk_data] Package punkt is already up-to-date!\n",
31
+ "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
32
+ "[nltk_data] Package punkt_tab is already up-to-date!\n",
33
+ "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
34
+ "[nltk_data] Package punkt is already up-to-date!\n"
35
+ ]
36
+ },
37
+ {
38
+ "output_type": "stream",
39
+ "name": "stdout",
40
+ "text": [
41
+ " text code workerid probid \\\n",
42
+ "0 NaN int main() { 1 3A \n",
43
+ "1 create string s string s; 1 3A \n",
44
+ "2 create integers x1, y1, x2, y2 int x1, y1, x2, y2; 1 3A \n",
45
+ "3 read s cin >> s; 1 3A \n",
46
+ "4 set x1 to s[0] - 96 x1 = s[0] - 96; 1 3A \n",
47
+ "\n",
48
+ " subid line indent \n",
49
+ "0 41470897 0 0 \n",
50
+ "1 41470897 1 1 \n",
51
+ "2 41470897 2 1 \n",
52
+ "3 41470897 3 1 \n",
53
+ "4 41470897 4 1 \n"
54
+ ]
55
+ }
56
+ ],
57
+ "source": [
58
+ "import pandas as pd\n",
59
+ "import nltk\n",
60
+ "from nltk.tokenize import word_tokenize\n",
61
+ "import nltk\n",
62
+ "\n",
63
+ "# Download the 'punkt' tokenizer data\n",
64
+ "nltk.download('punkt')\n",
65
+ "\n",
66
+ "# If 'punkt_tab' is still missing, try:\n",
67
+ "nltk.download('punkt_tab')\n",
68
+ "\n",
69
+ "\n",
70
+ "# Download punkt tokenizer\n",
71
+ "nltk.download('punkt')\n",
72
+ "\n",
73
+ "\n",
74
+ "# Load dataset\n",
75
+ "file_path = r\"/content/spoc-train-train.csv\" # Change to your dataset's path\n",
76
+ "df = pd.read_csv(file_path)\n",
77
+ "\n",
78
+ "# Display dataset information\n",
79
+ "print(df.head())\n"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": null,
85
+ "id": "6889f401",
86
+ "metadata": {
87
+ "colab": {
88
+ "base_uri": "https://localhost:8080/"
89
+ },
90
+ "id": "6889f401",
91
+ "outputId": "55269b88-418b-4fd5-d4ed-065109f899e7"
92
+ },
93
+ "outputs": [
94
+ {
95
+ "output_type": "stream",
96
+ "name": "stdout",
97
+ "text": [
98
+ "<class 'pandas.core.frame.DataFrame'>\n",
99
+ "RangeIndex: 246086 entries, 0 to 246085\n",
100
+ "Data columns (total 7 columns):\n",
101
+ " # Column Non-Null Count Dtype \n",
102
+ "--- ------ -------------- ----- \n",
103
+ " 0 text 181862 non-null object\n",
104
+ " 1 code 246086 non-null object\n",
105
+ " 2 workerid 246086 non-null int64 \n",
106
+ " 3 probid 246086 non-null object\n",
107
+ " 4 subid 246086 non-null int64 \n",
108
+ " 5 line 246086 non-null int64 \n",
109
+ " 6 indent 246086 non-null int64 \n",
110
+ "dtypes: int64(4), object(3)\n",
111
+ "memory usage: 13.1+ MB\n",
112
+ "None\n"
113
+ ]
114
+ }
115
+ ],
116
+ "source": [
117
+ "print(df.info())\n"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "markdown",
122
+ "id": "dd3ad88e",
123
+ "metadata": {
124
+ "id": "dd3ad88e"
125
+ },
126
+ "source": [
127
+ "# DATA PREPROCESSING"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "id": "3a08429f",
134
+ "metadata": {
135
+ "id": "3a08429f"
136
+ },
137
+ "outputs": [],
138
+ "source": [
139
+ "from nltk.tokenize import word_tokenize\n",
140
+ "\n",
141
+ "df[\"text\"] = df[\"text\"].astype(str)\n",
142
+ "df[\"text_tokens\"] = df[\"text\"].apply(word_tokenize)\n"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": null,
148
+ "id": "80e52203",
149
+ "metadata": {
150
+ "id": "80e52203"
151
+ },
152
+ "outputs": [],
153
+ "source": [
154
+ "df[\"text\"] = df[\"text\"].fillna(\"\") # Replace NaN with empty strings\n",
155
+ "df[\"text_tokens\"] = df[\"text\"].apply(word_tokenize)\n"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": null,
161
+ "id": "3e73cd24",
162
+ "metadata": {
163
+ "id": "3e73cd24"
164
+ },
165
+ "outputs": [],
166
+ "source": [
167
+ "df[\"text_tokens\"] = df[\"text\"].apply(lambda x: word_tokenize(x) if isinstance(x, str) else [])\n"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "123f02bd",
174
+ "metadata": {
175
+ "colab": {
176
+ "base_uri": "https://localhost:8080/"
177
+ },
178
+ "id": "123f02bd",
179
+ "outputId": "cf09d02c-e55b-4857-b46f-1aee035bab36"
180
+ },
181
+ "outputs": [
182
+ {
183
+ "output_type": "stream",
184
+ "name": "stdout",
185
+ "text": [
186
+ "object\n",
187
+ "0\n",
188
+ "Empty DataFrame\n",
189
+ "Columns: [text, code, workerid, probid, subid, line, indent, text_tokens]\n",
190
+ "Index: []\n"
191
+ ]
192
+ }
193
+ ],
194
+ "source": [
195
+ "print(df[\"text\"].dtype) # Check the column's data type\n",
196
+ "print(df[\"text\"].isna().sum()) # Count missing values\n",
197
+ "print(df[df[\"text\"].apply(lambda x: not isinstance(x, str))]) # Show non-string values\n"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "markdown",
202
+ "id": "7666823c",
203
+ "metadata": {
204
+ "id": "7666823c"
205
+ },
206
+ "source": [
207
+ "# TOKENIZATION"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": null,
213
+ "id": "1527b229",
214
+ "metadata": {
215
+ "colab": {
216
+ "base_uri": "https://localhost:8080/"
217
+ },
218
+ "id": "1527b229",
219
+ "outputId": "3911e731-9c6a-40a7-f625-e59d4491d13e"
220
+ },
221
+ "outputs": [
222
+ {
223
+ "output_type": "stream",
224
+ "name": "stderr",
225
+ "text": [
226
+ "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
227
+ "[nltk_data] Package punkt is already up-to-date!\n"
228
+ ]
229
+ }
230
+ ],
231
+ "source": [
232
+ "import nltk\n",
233
+ "from nltk.tokenize import word_tokenize\n",
234
+ "\n",
235
+ "# Download tokenizer if not available\n",
236
+ "nltk.download('punkt')\n",
237
+ "\n",
238
+ "# Tokenizing pseudocode and code\n",
239
+ "df[\"text_tokens\"] = df[\"text\"].apply(word_tokenize)\n",
240
+ "df[\"code_tokens\"] = df[\"code\"].apply(word_tokenize)\n",
241
+ "\n",
242
+ "\n"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "markdown",
247
+ "id": "62e8e137",
248
+ "metadata": {
249
+ "id": "62e8e137"
250
+ },
251
+ "source": [
252
+ "# PRINT TOKEN SAMPLES"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": null,
258
+ "id": "3667d51e",
259
+ "metadata": {
260
+ "colab": {
261
+ "base_uri": "https://localhost:8080/"
262
+ },
263
+ "id": "3667d51e",
264
+ "outputId": "9619ff63-be61-467c-a1a4-0cf13b14fced"
265
+ },
266
+ "outputs": [
267
+ {
268
+ "output_type": "stream",
269
+ "name": "stdout",
270
+ "text": [
271
+ "Samples from index 1-10:\n",
272
+ "Index 1:\n",
273
+ "Tokenized Pseudocode: ['create', 'string', 's']\n",
274
+ "Tokenized C++ Code: ['string', 's', ';']\n",
275
+ "--------------------------------------------------\n",
276
+ "Index 2:\n",
277
+ "Tokenized Pseudocode: ['create', 'integers', 'x1', ',', 'y1', ',', 'x2', ',', 'y2']\n",
278
+ "Tokenized C++ Code: ['int', 'x1', ',', 'y1', ',', 'x2', ',', 'y2', ';']\n",
279
+ "--------------------------------------------------\n",
280
+ "Index 3:\n",
281
+ "Tokenized Pseudocode: ['read', 's']\n",
282
+ "Tokenized C++ Code: ['cin', '>', '>', 's', ';']\n",
283
+ "--------------------------------------------------\n",
284
+ "Index 4:\n",
285
+ "Tokenized Pseudocode: ['set', 'x1', 'to', 's', '[', '0', ']', '-', '96']\n",
286
+ "Tokenized C++ Code: ['x1', '=', 's', '[', '0', ']', '-', '96', ';']\n",
287
+ "--------------------------------------------------\n",
288
+ "Index 5:\n",
289
+ "Tokenized Pseudocode: ['set', 'y1', 'to', 's', '[', '1', ']', '-', \"'\", '0', \"'\"]\n",
290
+ "Tokenized C++ Code: ['y1', '=', 's', '[', '1', ']', '-', \"'\", '0', \"'\", ';']\n",
291
+ "--------------------------------------------------\n",
292
+ "Index 6:\n",
293
+ "Tokenized Pseudocode: ['read', 's']\n",
294
+ "Tokenized C++ Code: ['cin', '>', '>', 's', ';']\n",
295
+ "--------------------------------------------------\n",
296
+ "Index 7:\n",
297
+ "Tokenized Pseudocode: ['set', 'x2', 'to', 's', '[', '0', ']', '-', '96']\n",
298
+ "Tokenized C++ Code: ['x2', '=', 's', '[', '0', ']', '-', '96', ';']\n",
299
+ "--------------------------------------------------\n",
300
+ "Index 8:\n",
301
+ "Tokenized Pseudocode: ['set', 'y2', 'to', 's', '[', '1', ']', '-', \"'\", '0', \"'\"]\n",
302
+ "Tokenized C++ Code: ['y2', '=', 's', '[', '1', ']', '-', \"'\", '0', \"'\", ';']\n",
303
+ "--------------------------------------------------\n",
304
+ "Index 9:\n",
305
+ "Tokenized Pseudocode: ['print', 'maximum', 'of', 'absolute', 'value', 'of', 'x1', '-', 'x2', 'and', 'absolute', 'value', 'of', 'y1', '-', 'y2', ',', 'print', 'newline']\n",
306
+ "Tokenized C++ Code: ['cout', '<', '<', 'max', '(', 'abs', '(', 'x1', '-', 'x2', ')', ',', 'abs', '(', 'y1', '-', 'y2', ')', ')', '<', '<', 'endl', ';']\n",
307
+ "--------------------------------------------------\n",
308
+ "Index 10:\n",
309
+ "Tokenized Pseudocode: ['while', 'x1', 'is', 'not', 'x2', 'or', 'y1', 'is', 'not', 'y2']\n",
310
+ "Tokenized C++ Code: ['while', '(', 'x1', '!', '=', 'x2', '||', 'y1', '!', '=', 'y2', ')', '{']\n",
311
+ "--------------------------------------------------\n",
312
+ "\n",
313
+ "Samples from index 20-30:\n",
314
+ "Index 20:\n",
315
+ "Tokenized Pseudocode: ['print', '``', 'D', \"''\"]\n",
316
+ "Tokenized C++ Code: ['cout', '<', '<', '``', 'D', \"''\", ';']\n",
317
+ "--------------------------------------------------\n",
318
+ "Index 21:\n",
319
+ "Tokenized Pseudocode: ['decrement', 'y1']\n",
320
+ "Tokenized C++ Code: ['y1', '--', ';']\n",
321
+ "--------------------------------------------------\n",
322
+ "Index 22:\n",
323
+ "Tokenized Pseudocode: ['nan']\n",
324
+ "Tokenized C++ Code: ['}']\n",
325
+ "--------------------------------------------------\n",
326
+ "Index 23:\n",
327
+ "Tokenized Pseudocode: ['if', 'y1', 'is', 'less', 'than', 'y2']\n",
328
+ "Tokenized C++ Code: ['if', '(', 'y1', '<', 'y2', ')', '{']\n",
329
+ "--------------------------------------------------\n",
330
+ "Index 24:\n",
331
+ "Tokenized Pseudocode: ['print', '``', 'U', \"''\"]\n",
332
+ "Tokenized C++ Code: ['cout', '<', '<', '``', 'U', \"''\", ';']\n",
333
+ "--------------------------------------------------\n",
334
+ "Index 25:\n",
335
+ "Tokenized Pseudocode: ['increment', 'y1']\n",
336
+ "Tokenized C++ Code: ['y1++', ';']\n",
337
+ "--------------------------------------------------\n",
338
+ "Index 26:\n",
339
+ "Tokenized Pseudocode: ['nan']\n",
340
+ "Tokenized C++ Code: ['}']\n",
341
+ "--------------------------------------------------\n",
342
+ "Index 27:\n",
343
+ "Tokenized Pseudocode: ['print', '``', '\\\\n', \"''\"]\n",
344
+ "Tokenized C++ Code: ['cout', '<', '<', '``', '\\\\n', \"''\", ';']\n",
345
+ "--------------------------------------------------\n",
346
+ "Index 28:\n",
347
+ "Tokenized Pseudocode: ['nan']\n",
348
+ "Tokenized C++ Code: ['}']\n",
349
+ "--------------------------------------------------\n",
350
+ "Index 29:\n",
351
+ "Tokenized Pseudocode: ['nan']\n",
352
+ "Tokenized C++ Code: ['return', '0', ';']\n",
353
+ "--------------------------------------------------\n",
354
+ "Index 30:\n",
355
+ "Tokenized Pseudocode: ['nan']\n",
356
+ "Tokenized C++ Code: ['}']\n",
357
+ "--------------------------------------------------\n"
358
+ ]
359
+ }
360
+ ],
361
+ "source": [
362
+ "# Print samples from index 1-10\n",
363
+ "print(\"Samples from index 1-10:\")\n",
364
+ "for i in range(1, 11):\n",
365
+ " print(f\"Index {i}:\")\n",
366
+ " print(\"Tokenized Pseudocode:\", df[\"text_tokens\"].iloc[i])\n",
367
+ " print(\"Tokenized C++ Code:\", df[\"code_tokens\"].iloc[i])\n",
368
+ " print(\"-\" * 50)\n",
369
+ "\n",
370
+ "# Print samples from index 20-30\n",
371
+ "print(\"\\nSamples from index 20-30:\")\n",
372
+ "for i in range(20, 31):\n",
373
+ " print(f\"Index {i}:\")\n",
374
+ " print(\"Tokenized Pseudocode:\", df[\"text_tokens\"].iloc[i])\n",
375
+ " print(\"Tokenized C++ Code:\", df[\"code_tokens\"].iloc[i])\n",
376
+ " print(\"-\" * 50)\n"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "source": [
382
+ "# Save tokenized pseudocode and C++ code to CSV\n",
383
+ "output_file = \"tokenized_spoc.csv\"\n",
384
+ "df[[\"text_tokens\", \"code_tokens\"]].to_csv(output_file, index=False)\n",
385
+ "\n",
386
+ "print(f\"Tokenized data saved to {output_file}\")\n"
387
+ ],
388
+ "metadata": {
389
+ "id": "Sd8I0TttbCaZ",
390
+ "colab": {
391
+ "base_uri": "https://localhost:8080/"
392
+ },
393
+ "outputId": "05541ebd-454f-455d-e2a3-57f6df689997"
394
+ },
395
+ "id": "Sd8I0TttbCaZ",
396
+ "execution_count": null,
397
+ "outputs": [
398
+ {
399
+ "output_type": "stream",
400
+ "name": "stdout",
401
+ "text": [
402
+ "Tokenized data saved to tokenized_spoc.csv\n"
403
+ ]
404
+ }
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "source": [
410
+ "# Add start and end tokens to tokenized C++ code\n",
411
+ "df[\"code_tokens\"] = df[\"code_tokens\"].apply(lambda tokens: [\"<start>\"] + tokens + [\"<end>\"])\n",
412
+ "\n",
413
+ "# Save updated tokenized data to CSV\n",
414
+ "output_file = \"tokenized_spoc_with_tokens.csv\"\n",
415
+ "df[[\"text_tokens\", \"code_tokens\"]].to_csv(output_file, index=False)\n",
416
+ "\n",
417
+ "print(f\"Updated tokenized data saved to {output_file}\")\n"
418
+ ],
419
+ "metadata": {
420
+ "colab": {
421
+ "base_uri": "https://localhost:8080/"
422
+ },
423
+ "id": "-HAGLVzqXEQy",
424
+ "outputId": "59f572b7-bf31-4171-9112-2edec63c3937"
425
+ },
426
+ "id": "-HAGLVzqXEQy",
427
+ "execution_count": null,
428
+ "outputs": [
429
+ {
430
+ "output_type": "stream",
431
+ "name": "stdout",
432
+ "text": [
433
+ "Updated tokenized data saved to tokenized_spoc_with_tokens.csv\n"
434
+ ]
435
+ }
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "source": [
441
+ "# Make \"text_tokens\" and \"code_tokens\" length same by padding with \"<pad>\"\n",
442
+ "max_len = max(df[\"text_tokens\"].apply(len).max(), df[\"code_tokens\"].apply(len).max())\n",
443
+ "\n",
444
+ "df[\"text_tokens\"] = df[\"text_tokens\"].apply(lambda tokens: tokens + [\"<pad>\"] * (max_len - len(tokens)))\n",
445
+ "df[\"code_tokens\"] = df[\"code_tokens\"].apply(lambda tokens: tokens + [\"<pad>\"] * (max_len - len(tokens)))\n",
446
+ "\n",
447
+ "# Save padded tokenized data to CSV\n",
448
+ "output_file = \"tokenized_spoc_padded.csv\"\n",
449
+ "df[[\"text_tokens\", \"code_tokens\"]].to_csv(output_file, index=False)\n",
450
+ "\n",
451
+ "print(f\"Padded tokenized data saved to {output_file}\")\n"
452
+ ],
453
+ "metadata": {
454
+ "colab": {
455
+ "base_uri": "https://localhost:8080/"
456
+ },
457
+ "id": "6tRCwKS0X25B",
458
+ "outputId": "32366e38-5c7b-4c45-cc7a-6413dc6df8da"
459
+ },
460
+ "id": "6tRCwKS0X25B",
461
+ "execution_count": null,
462
+ "outputs": [
463
+ {
464
+ "output_type": "stream",
465
+ "name": "stdout",
466
+ "text": [
467
+ "Padded tokenized data saved to tokenized_spoc_padded.csv\n"
468
+ ]
469
+ }
470
+ ]
471
+ },
472
+ {
473
+ "cell_type": "code",
474
+ "source": [
475
+ "import json\n",
476
+ "\n",
477
+ "# Define special tokens with fixed indices\n",
478
+ "vocab = {\n",
479
+ " \"<unk>\": 0,\n",
480
+ " \"<pad>\": 1,\n",
481
+ " \"<start>\": 2,\n",
482
+ " \"<end>\": 3\n",
483
+ "}\n",
484
+ "\n",
485
+ "# Assign indices to other tokens\n",
486
+ "for column in [\"text_tokens\", \"code_tokens\"]:\n",
487
+ " for tokens in df[column]:\n",
488
+ " for token in tokens:\n",
489
+ " if token not in vocab:\n",
490
+ " vocab[token] = len(vocab)\n",
491
+ "\n",
492
+ "# Save vocabulary to JSON\n",
493
+ "vocab_file = \"vocabulary.json\"\n",
494
+ "with open(vocab_file, \"w\") as f:\n",
495
+ " json.dump(vocab, f, indent=4)\n",
496
+ "\n",
497
+ "print(f\"Vocabulary saved to {vocab_file}\")\n"
498
+ ],
499
+ "metadata": {
500
+ "colab": {
501
+ "base_uri": "https://localhost:8080/"
502
+ },
503
+ "id": "r4hbVXb5YI4-",
504
+ "outputId": "36805383-5738-4634-fc97-472fd68c399c"
505
+ },
506
+ "id": "r4hbVXb5YI4-",
507
+ "execution_count": null,
508
+ "outputs": [
509
+ {
510
+ "output_type": "stream",
511
+ "name": "stdout",
512
+ "text": [
513
+ "Vocabulary saved to vocabulary.json\n"
514
+ ]
515
+ }
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "source": [
521
+ "# Load vocabulary\n",
522
+ "with open(\"vocabulary.json\", \"r\") as f:\n",
523
+ " vocab = json.load(f)\n",
524
+ "\n",
525
+ "# Load tokenized data\n",
526
+ "df = pd.read_csv(\"/content/tokenized_spoc_padded.csv\")\n",
527
+ "\n",
528
+ "# Convert string tokens to lists\n",
529
+ "df[\"text_tokens\"] = df[\"text_tokens\"].apply(eval)\n",
530
+ "df[\"code_tokens\"] = df[\"code_tokens\"].apply(eval)\n",
531
+ "\n",
532
+ "# Convert tokens to sequences using vocabulary\n",
533
+ "df[\"text_sequences\"] = df[\"text_tokens\"].apply(lambda tokens: [vocab.get(token, vocab[\"<unk>\"]) for token in tokens])\n",
534
+ "df[\"code_sequences\"] = df[\"code_tokens\"].apply(lambda tokens: [vocab.get(token, vocab[\"<unk>\"]) for token in tokens])\n",
535
+ "\n",
536
+ "# Save sequences to CSV\n",
537
+ "output_file = \"tokenized_sequences.csv\"\n",
538
+ "df[[\"text_sequences\", \"code_sequences\"]].to_csv(output_file, index=False)\n",
539
+ "\n",
540
+ "print(f\"Tokenized sequences saved to {output_file}\")\n"
541
+ ],
542
+ "metadata": {
543
+ "colab": {
544
+ "base_uri": "https://localhost:8080/"
545
+ },
546
+ "id": "_TEFKw4KY6VO",
547
+ "outputId": "53b130fa-6c16-4356-9979-3a87b00e53ca"
548
+ },
549
+ "id": "_TEFKw4KY6VO",
550
+ "execution_count": null,
551
+ "outputs": [
552
+ {
553
+ "output_type": "stream",
554
+ "name": "stdout",
555
+ "text": [
556
+ "Tokenized sequences saved to tokenized_sequences.csv\n"
557
+ ]
558
+ }
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "source": [
564
+ "from torch.utils.data import DataLoader, Dataset\n",
565
+ "import pandas as pd\n",
566
+ "import torch\n",
567
+ "import ast\n",
568
+ "from torch.nn.utils.rnn import pad_sequence\n",
569
+ "from tqdm import tqdm\n",
570
+ "\n",
571
+ "class DataLoad(Dataset):\n",
572
+ " def __init__(self, file_path):\n",
573
+ " df = pd.read_csv(file_path)\n",
574
+ " self.inputs = [ast.literal_eval(x) for x in df['text_sequences']]\n",
575
+ " self.outputs = [ast.literal_eval(x) for x in df['code_sequences']]\n",
576
+ "\n",
577
+ " def __len__(self):\n",
578
+ " return len(self.inputs)\n",
579
+ "\n",
580
+ " def __getitem__(self, idx):\n",
581
+ " input_tensor = torch.tensor(self.inputs[idx], dtype=torch.int64)\n",
582
+ " output_tensor = torch.tensor(self.outputs[idx], dtype=torch.int64)\n",
583
+ " return input_tensor, output_tensor\n",
584
+ "\n",
585
+ "def Add_Pad(batch):\n",
586
+ " inputs, outputs = zip(*batch)\n",
587
+ " inputs = pad_sequence(inputs, batch_first=True, padding_value=0)\n",
588
+ " outputs = pad_sequence(outputs, batch_first=True, padding_value=0)\n",
589
+ " return inputs, outputs\n",
590
+ "\n",
591
+ "# Load dataset and dataloader\n",
592
+ "dataset = DataLoad('/content/tokenized_sequences.csv')\n",
593
+ "dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=Add_Pad)\n",
594
+ "\n",
595
+ "# Iterate with progress bar\n",
596
+ "data_iter = iter(dataloader)\n",
597
+ "for batch in tqdm(dataloader, desc=\"Loading Batches\"):\n",
598
+ " features, labels = batch # Get a batch of data\n",
599
+ " break # Remove this if you want to iterate over all batches\n",
600
+ "\n",
601
+ "print(\"Batch loaded successfully!\")\n"
602
+ ],
603
+ "metadata": {
604
+ "colab": {
605
+ "base_uri": "https://localhost:8080/"
606
+ },
607
+ "id": "8_ySZqiqaHUD",
608
+ "outputId": "87124b59-8b7b-42ba-bba0-9c9a0ca1926c"
609
+ },
610
+ "id": "8_ySZqiqaHUD",
611
+ "execution_count": 16,
612
+ "outputs": [
613
+ {
614
+ "output_type": "stream",
615
+ "name": "stderr",
616
+ "text": [
617
+ "Loading Batches: 0%| | 0/3846 [00:00<?, ?it/s]"
618
+ ]
619
+ },
620
+ {
621
+ "output_type": "stream",
622
+ "name": "stdout",
623
+ "text": [
624
+ "Batch loaded successfully!\n"
625
+ ]
626
+ },
627
+ {
628
+ "output_type": "stream",
629
+ "name": "stderr",
630
+ "text": [
631
+ "\n"
632
+ ]
633
+ }
634
+ ]
635
+ },
636
+ {
637
+ "cell_type": "code",
638
+ "source": [
639
+ "print(features)\n",
640
+ "print(labels)"
641
+ ],
642
+ "metadata": {
643
+ "colab": {
644
+ "base_uri": "https://localhost:8080/"
645
+ },
646
+ "id": "BeXmffD0bl4E",
647
+ "outputId": "072ef76d-d277-4357-c108-b7529cc5cc95"
648
+ },
649
+ "id": "BeXmffD0bl4E",
650
+ "execution_count": null,
651
+ "outputs": [
652
+ {
653
+ "output_type": "stream",
654
+ "name": "stdout",
655
+ "text": [
656
+ "tensor([[ 77, 616, 16, ..., 1, 1, 1],\n",
657
+ " [ 4, 1, 1, ..., 1, 1, 1],\n",
658
+ " [2998, 378, 67, ..., 1, 1, 1],\n",
659
+ " ...,\n",
660
+ " [ 4, 1, 1, ..., 1, 1, 1],\n",
661
+ " [ 4, 1, 1, ..., 1, 1, 1],\n",
662
+ " [ 168, 8, 179, ..., 1, 1, 1]])\n",
663
+ "tensor([[ 2, 77, 50, ..., 1, 1, 1],\n",
664
+ " [ 2, 173, 18, ..., 1, 1, 1],\n",
665
+ " [ 2, 67, 87, ..., 1, 1, 1],\n",
666
+ " ...,\n",
667
+ " [ 2, 328, 3, ..., 1, 1, 1],\n",
668
+ " [ 2, 328, 3, ..., 1, 1, 1],\n",
669
+ " [ 2, 108, 179, ..., 1, 1, 1]])\n"
670
+ ]
671
+ }
672
+ ]
673
+ },
674
+ {
675
+ "cell_type": "code",
676
+ "source": [
677
+ "import torch\n",
678
+ "import torch.nn as nn\n",
679
+ "import torch.optim as optim\n",
680
+ "import math\n",
681
+ "\n",
682
+ "# Transformer Hyperparameters\n",
683
+ "class Config:\n",
684
+ " vocab_size = 12006 # Adjust based on vocabulary.json\n",
685
+ " max_length = 100 # Adjust based on sequence length\n",
686
+ " embed_dim = 256\n",
687
+ " num_heads = 8\n",
688
+ " num_layers =2\n",
689
+ " feedforward_dim = 512\n",
690
+ " dropout = 0.1\n",
691
+ " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
692
+ "\n",
693
+ "config = Config()\n",
694
+ "\n",
695
+ "# Positional Encoding\n",
696
+ "class PositionalEncoding(nn.Module):\n",
697
+ " def __init__(self, embed_dim, max_len=100):\n",
698
+ " super(PositionalEncoding, self).__init__()\n",
699
+ " pe = torch.zeros(max_len, embed_dim)\n",
700
+ " position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n",
701
+ " div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))\n",
702
+ " pe[:, 0::2] = torch.sin(position * div_term)\n",
703
+ " pe[:, 1::2] = torch.cos(position * div_term)\n",
704
+ " self.pe = pe.unsqueeze(0) # Shape: (1, max_len, embed_dim)\n",
705
+ "\n",
706
+ " def forward(self, x):\n",
707
+ " return x + self.pe[:, :x.size(1)].to(x.device)\n",
708
+ "\n",
709
+ "# Transformer Model\n",
710
+ "class PseudoCodeTransformer(nn.Module):\n",
711
+ " def __init__(self, config):\n",
712
+ " super(PseudoCodeTransformer, self).__init__()\n",
713
+ " self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)\n",
714
+ " self.positional_encoding = PositionalEncoding(config.embed_dim, config.max_length)\n",
715
+ "\n",
716
+ " self.transformer = nn.Transformer(\n",
717
+ " d_model=config.embed_dim,\n",
718
+ " nhead=config.num_heads,\n",
719
+ " num_encoder_layers=config.num_layers,\n",
720
+ " num_decoder_layers=config.num_layers,\n",
721
+ " dim_feedforward=config.feedforward_dim,\n",
722
+ " dropout=config.dropout\n",
723
+ " )\n",
724
+ "\n",
725
+ " self.fc_out = nn.Linear(config.embed_dim, config.vocab_size)\n",
726
+ " self.dropout = nn.Dropout(config.dropout)\n",
727
+ "\n",
728
+ " def generate_square_subsequent_mask(self, sz):\n",
729
+ " return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(config.device)\n",
730
+ "\n",
731
+ " def forward(self, src, tgt):\n",
732
+ " src_emb = self.embedding(src) * math.sqrt(config.embed_dim)\n",
733
+ " tgt_emb = self.embedding(tgt) * math.sqrt(config.embed_dim)\n",
734
+ "\n",
735
+ " src_emb = self.positional_encoding(src_emb)\n",
736
+ " tgt_emb = self.positional_encoding(tgt_emb)\n",
737
+ "\n",
738
+ " src_mask = self.generate_square_subsequent_mask(src.size(1))\n",
739
+ " tgt_mask = self.generate_square_subsequent_mask(tgt.size(1))\n",
740
+ "\n",
741
+ " out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),\n",
742
+ " src_mask=src_mask, tgt_mask=tgt_mask)\n",
743
+ "\n",
744
+ " out = self.fc_out(out.permute(1, 0, 2)) # Convert back to batch-first\n",
745
+ " return out\n",
746
+ "\n",
747
+ "# Initialize Model\n",
748
+ "model = PseudoCodeTransformer(config).to(config.device)\n",
749
+ "print(\"Model initialized successfully!\")\n"
750
+ ],
751
+ "metadata": {
752
+ "colab": {
753
+ "base_uri": "https://localhost:8080/"
754
+ },
755
+ "id": "azPgilarcWXf",
756
+ "outputId": "143ee579-1fbf-4ff8-b54f-e4b2116245e3"
757
+ },
758
+ "id": "azPgilarcWXf",
759
+ "execution_count": 2,
760
+ "outputs": [
761
+ {
762
+ "output_type": "stream",
763
+ "name": "stderr",
764
+ "text": [
765
+ "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n",
766
+ " warnings.warn(\n"
767
+ ]
768
+ },
769
+ {
770
+ "output_type": "stream",
771
+ "name": "stdout",
772
+ "text": [
773
+ "Model initialized successfully!\n"
774
+ ]
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "cell_type": "code",
780
+ "source": [
781
+ "def translate(model, pseudocode_tokens, vocab, device, max_length=50):\n",
782
+ " model.eval()\n",
783
+ "\n",
784
+ " # Convert pseudocode tokens to numerical indices\n",
785
+ " input_ids = [vocab.get(token, vocab[\"<unk>\"]) for token in pseudocode_tokens]\n",
786
+ " input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device) # Add batch dimension\n",
787
+ "\n",
788
+ " # Start token for generation\n",
789
+ " output_ids = [vocab[\"<start>\"]]\n",
790
+ "\n",
791
+ " for _ in range(max_length):\n",
792
+ " output_tensor = torch.tensor(output_ids, dtype=torch.long).unsqueeze(0).to(device)\n",
793
+ "\n",
794
+ " # Get model predictions\n",
795
+ " with torch.no_grad():\n",
796
+ " predictions = model(input_tensor, output_tensor)\n",
797
+ "\n",
798
+ " # Select the most probable token\n",
799
+ " next_token_id = predictions.argmax(dim=-1)[:, -1].item()\n",
800
+ " output_ids.append(next_token_id)\n",
801
+ "\n",
802
+ " # Stop if end token is generated\n",
803
+ " if next_token_id == vocab[\"<end>\"]:\n",
804
+ " break\n",
805
+ "\n",
806
+ " # Convert token indices back to words\n",
807
+ " id_to_token = {idx: token for token, idx in vocab.items()}\n",
808
+ " translated_code = [id_to_token.get(idx, \"<unk>\") for idx in output_ids[1:]] # Exclude <start> token\n",
809
+ "\n",
810
+ " return \" \".join(translated_code)\n"
811
+ ],
812
+ "metadata": {
813
+ "id": "2XsYwb5jLxAT"
814
+ },
815
+ "id": "2XsYwb5jLxAT",
816
+ "execution_count": 5,
817
+ "outputs": []
818
+ },
819
+ {
820
+ "cell_type": "code",
821
+ "source": [
822
+ "import json\n",
823
+ "\n",
824
+ "# Load vocabulary\n",
825
+ "with open(\"vocabulary.json\", \"r\") as f:\n",
826
+ " vocab = json.load(f)\n",
827
+ "\n",
828
+ "# Report how many entries the loaded vocabulary contains\n",
829
+ "print(f\"✅ Vocabulary loaded with {len(vocab)} tokens\")\n"
830
+ ],
831
+ "metadata": {
832
+ "colab": {
833
+ "base_uri": "https://localhost:8080/"
834
+ },
835
+ "id": "OJBeh_zNL6ZM",
836
+ "outputId": "44bee4c9-e9ea-4c09-fa91-a25e4efb3e7d"
837
+ },
838
+ "id": "OJBeh_zNL6ZM",
839
+ "execution_count": 7,
840
+ "outputs": [
841
+ {
842
+ "output_type": "stream",
843
+ "name": "stdout",
844
+ "text": [
845
+ "✅ Vocabulary loaded with 12006 tokens\n"
846
+ ]
847
+ }
848
+ ]
849
+ },
850
+ {
851
+ "cell_type": "code",
852
+ "source": [
853
+ "from torch.utils.data import DataLoader\n",
854
+ "import torch.nn.functional as F\n",
855
+ "from tqdm import tqdm\n",
856
+ "import os\n",
857
+ "\n",
858
+ "# Check for GPU availability\n",
859
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
860
+ "print(f\"🔹 Using device: {device}\")\n",
861
+ "\n",
862
+ "# Move model to device\n",
863
+ "model.to(device)\n",
864
+ "\n",
865
+ "# Loss Function & Optimizer\n",
866
+ "criterion = nn.CrossEntropyLoss(ignore_index=1) # Exclude token id 1 from the loss; NOTE(review): Add_Pad pads with 0 - confirm which id is <pad>\n",
867
+ "optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-3)\n",
868
+ "\n",
869
+ "# Create directory to save models\n",
870
+ "os.makedirs(\"checkpoints\", exist_ok=True)\n",
871
+ "\n",
872
+ "# Training Loop\n",
873
+ "num_epochs = 1\n",
874
+ "for epoch in range(num_epochs):\n",
875
+ " model.train()\n",
876
+ " epoch_loss = 0\n",
877
+ "\n",
878
+ " progress_bar = tqdm(dataloader, desc=f\"Epoch {epoch+1}/{num_epochs}\")\n",
879
+ " for batch in progress_bar:\n",
880
+ " src, tgt = batch\n",
881
+ " src, tgt = src.to(device), tgt.to(device) # Move batch to GPU\n",
882
+ "\n",
883
+ " tgt_input = tgt[:, :-1] # Decoder input: drop the last position of each padded row\n",
884
+ " tgt_output = tgt[:, 1:] # Shifted version\n",
885
+ "\n",
886
+ " optimizer.zero_grad()\n",
887
+ " output = model(src, tgt_input)\n",
888
+ "\n",
889
+ " loss = criterion(output.view(-1, config.vocab_size), tgt_output.contiguous().view(-1))\n",
890
+ " loss.backward()\n",
891
+ " optimizer.step()\n",
892
+ "\n",
893
+ " epoch_loss += loss.item()\n",
894
+ " progress_bar.set_postfix(loss=loss.item())\n",
895
+ "\n",
896
+ " avg_loss = epoch_loss / len(dataloader)\n",
897
+ " print(f\"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}\")\n",
898
+ "\n",
899
+ " # Save Model Checkpoint\n",
900
+ " torch.save(model.state_dict(), f\"checkpoints/transformer_epoch_{epoch+1}.pth\")\n",
901
+ " print(f\"✅ Model saved: checkpoints/transformer_epoch_{epoch+1}.pth\")\n",
902
+ "\n",
903
+ " # Print Example Prediction\n",
904
+ " model.eval()\n",
905
+ " example_pseudocode = [\"create\", \"integer\", \"x\"]\n",
906
+ " translated_code = translate(model, example_pseudocode, vocab, device)\n",
907
+ " print(f\"🔹 Example Prediction (Pseudocode → C++): {translated_code}\\n\")\n"
908
+ ],
909
+ "metadata": {
910
+ "id": "SUOIS04idMXB",
911
+ "colab": {
912
+ "base_uri": "https://localhost:8080/"
913
+ },
914
+ "outputId": "9134b609-78d4-4f07-bd4a-7057ba9b5adf"
915
+ },
916
+ "id": "SUOIS04idMXB",
917
+ "execution_count": null,
918
+ "outputs": [
919
+ {
920
+ "output_type": "stream",
921
+ "name": "stdout",
922
+ "text": [
923
+ "🔹 Using device: cuda\n"
924
+ ]
925
+ },
926
+ {
927
+ "output_type": "stream",
928
+ "name": "stderr",
929
+ "text": [
930
+ "Epoch 1/1: 100%|██████████| 3846/3846 [06:02<00:00, 10.62it/s, loss=0.861]\n"
931
+ ]
932
+ },
933
+ {
934
+ "output_type": "stream",
935
+ "name": "stdout",
936
+ "text": [
937
+ "Epoch [1/1], Loss: 0.9374\n",
938
+ "✅ Model saved: checkpoints/transformer_epoch_1.pth\n",
939
+ "🔹 Example Prediction (Pseudocode → C++): int x int x , int x , int x , int x , int x , int x , int x , int x , int x , int x , int x , int x , int x ; <end>\n",
940
+ "\n"
941
+ ]
942
+ }
943
+ ]
944
+ },
945
+ {
946
+ "cell_type": "code",
947
+ "source": [
948
+ "example_pseudocode = [\"for\", \"i\", \"=\", \"0\", \"to\", \"size\", \"of\", \"ans\", \"exclusive\", \",\", \"print\", \"ans\", \"[\", \"i\", \"]\", \"print\", \"newline\"]\n",
949
+ "translated_code = translate(model, example_pseudocode, vocab, device)\n",
950
+ "print(f\"🔹 Example Prediction (Pseudocode → C++): {translated_code}\\n\")"
951
+ ],
952
+ "metadata": {
953
+ "colab": {
954
+ "base_uri": "https://localhost:8080/"
955
+ },
956
+ "id": "D5pxkdcUL5Gd",
957
+ "outputId": "c479d4a4-cfde-48cb-d0e7-22c74db2a500"
958
+ },
959
+ "id": "D5pxkdcUL5Gd",
960
+ "execution_count": null,
961
+ "outputs": [
962
+ {
963
+ "output_type": "stream",
964
+ "name": "stdout",
965
+ "text": [
966
+ "🔹 Example Prediction (Pseudocode → C++): for ( int i = 0 ; i < ( ans ) ; i++ ) { cout < < < < ( ans [ i ] .size ( ans [ i ] .size ( ans [ i ] .size ( ans [ i ] , 0 ] ) ] .size\n",
967
+ "\n"
968
+ ]
969
+ }
970
+ ]
971
+ },
972
+ {
973
+ "cell_type": "code",
974
+ "source": [
975
+ "for (int i = 0; i < ans.size(); i++) { cout << ans[i] << endl; }"
976
+ ],
977
+ "metadata": {
978
+ "colab": {
979
+ "base_uri": "https://localhost:8080/"
980
+ },
981
+ "id": "hh7c0AziPqG5",
982
+ "outputId": "4f44229a-5778-4a2c-e368-680f387a6589"
983
+ },
984
+ "id": "hh7c0AziPqG5",
985
+ "execution_count": null,
986
+ "outputs": [
987
+ {
988
+ "output_type": "stream",
989
+ "name": "stdout",
990
+ "text": [
991
+ "🔹 Using device: cuda\n"
992
+ ]
993
+ },
994
+ {
995
+ "output_type": "execute_result",
996
+ "data": {
997
+ "text/plain": [
998
+ "PseudoCodeTransformer(\n",
999
+ " (embedding): Embedding(12006, 256)\n",
1000
+ " (positional_encoding): PositionalEncoding()\n",
1001
+ " (transformer): Transformer(\n",
1002
+ " (encoder): TransformerEncoder(\n",
1003
+ " (layers): ModuleList(\n",
1004
+ " (0-1): 2 x TransformerEncoderLayer(\n",
1005
+ " (self_attn): MultiheadAttention(\n",
1006
+ " (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)\n",
1007
+ " )\n",
1008
+ " (linear1): Linear(in_features=256, out_features=512, bias=True)\n",
1009
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1010
+ " (linear2): Linear(in_features=512, out_features=256, bias=True)\n",
1011
+ " (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
1012
+ " (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
1013
+ " (dropout1): Dropout(p=0.1, inplace=False)\n",
1014
+ " (dropout2): Dropout(p=0.1, inplace=False)\n",
1015
+ " )\n",
1016
+ " )\n",
1017
+ " (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
1018
+ " )\n",
1019
+ " (decoder): TransformerDecoder(\n",
1020
+ " (layers): ModuleList(\n",
1021
+ " (0-1): 2 x TransformerDecoderLayer(\n",
1022
+ " (self_attn): MultiheadAttention(\n",
1023
+ " (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)\n",
1024
+ " )\n",
1025
+ " (multihead_attn): MultiheadAttention(\n",
1026
+ " (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)\n",
1027
+ " )\n",
1028
+ " (linear1): Linear(in_features=256, out_features=512, bias=True)\n",
1029
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1030
+ " (linear2): Linear(in_features=512, out_features=256, bias=True)\n",
1031
+ " (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
1032
+ " (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
1033
+ " (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
1034
+ " (dropout1): Dropout(p=0.1, inplace=False)\n",
1035
+ " (dropout2): Dropout(p=0.1, inplace=False)\n",
1036
+ " (dropout3): Dropout(p=0.1, inplace=False)\n",
1037
+ " )\n",
1038
+ " )\n",
1039
+ " (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
1040
+ " )\n",
1041
+ " )\n",
1042
+ " (fc_out): Linear(in_features=256, out_features=12006, bias=True)\n",
1043
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1044
+ ")"
1045
+ ]
1046
+ },
1047
+ "metadata": {},
1048
+ "execution_count": 8
1049
+ }
1050
+ ]
1051
+ },
1052
+ {
1053
+ "cell_type": "code",
1054
+ "source": [
1055
+ "# Load the trained model\n",
1056
+ "\n",
1057
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
1058
+ "print(f\"🔹 Using device: {device}\")\n",
1059
+ "\n",
1060
+ "# Move model to device\n",
1061
+ "model.to(device)\n",
1062
+ "\n",
1063
+ "model = PseudoCodeTransformer(config).to(device)\n",
1064
+ "model.load_state_dict(torch.load(\"/content/transformer_epoch_1.pth\", map_location=device))\n",
1065
+ "model.eval()\n",
1066
+ "\n",
1067
+ "# Run translation on example pseudocode\n",
1068
+ "example_pseudocode = [\"for\", \"i\", \"=\", \"0\", \"to\", \"size\", \"of\", \"ans\", \"exclusive\", \",\", \"print\", \"ans\", \"[\", \"i\", \"]\", \"print\", \"newline\"]\n",
1069
+ "translated_code = translate(model, example_pseudocode, vocab, device)\n",
1070
+ "\n",
1071
+ "print(f\"🔹 Example Prediction (Pseudocode → C++): {translated_code}\\n\")\n"
1072
+ ],
1073
+ "metadata": {
1074
+ "id": "t-xzQokaPy_E",
1075
+ "colab": {
1076
+ "base_uri": "https://localhost:8080/"
1077
+ },
1078
+ "outputId": "9bdff1cd-1094-4ba8-df9f-9cbde4a628bd"
1079
+ },
1080
+ "id": "t-xzQokaPy_E",
1081
+ "execution_count": 8,
1082
+ "outputs": [
1083
+ {
1084
+ "output_type": "stream",
1085
+ "name": "stdout",
1086
+ "text": [
1087
+ "🔹 Using device: cuda\n"
1088
+ ]
1089
+ },
1090
+ {
1091
+ "output_type": "stream",
1092
+ "name": "stderr",
1093
+ "text": [
1094
+ "<ipython-input-8-057fb80d4514>:10: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
1095
+ " model.load_state_dict(torch.load(\"/content/transformer_epoch_1.pth\", map_location=device))\n"
1096
+ ]
1097
+ },
1098
+ {
1099
+ "output_type": "stream",
1100
+ "name": "stdout",
1101
+ "text": [
1102
+ "🔹 Example Prediction (Pseudocode → C++): for ( int i = 0 ; i < ( ans ) ; i++ ) { cout < < < < ( ans [ i ] .size ( ans [ i ] .size ( ans [ i ] .size ( ans [ i ] , 0 ] ) ] .size\n",
1103
+ "\n"
1104
+ ]
1105
+ }
1106
+ ]
1107
+ },
1108
+ {
1109
+ "cell_type": "markdown",
1110
+ "source": [
1111
+ "**C++ CODE TO PSEUDOCODE**"
1112
+ ],
1113
+ "metadata": {
1114
+ "id": "CrE_8fkdRdfQ"
1115
+ },
1116
+ "id": "CrE_8fkdRdfQ"
1117
+ },
1118
+ {
1119
+ "cell_type": "code",
1120
+ "source": [
1121
+ "import torch\n",
1122
+ "import torch.nn as nn\n",
1123
+ "import torch.optim as optim\n",
1124
+ "import math\n",
1125
+ "import pandas as pd\n",
1126
+ "import ast\n",
1127
+ "import json\n",
1128
+ "from torch.utils.data import DataLoader, Dataset\n",
1129
+ "from torch.nn.utils.rnn import pad_sequence\n",
1130
+ "from tqdm import tqdm\n",
1131
+ "import os\n",
1132
+ "\n",
1133
+ "# Transformer Hyperparameters\n",
1134
+ "class Config:\n",
1135
+ " vocab_size = 12006 # Adjust based on vocabulary.json\n",
1136
+ " max_length = 100\n",
1137
+ " embed_dim = 256\n",
1138
+ " num_heads = 8\n",
1139
+ " num_layers = 2\n",
1140
+ " feedforward_dim = 512\n",
1141
+ " dropout = 0.1\n",
1142
+ " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
1143
+ "\n",
1144
+ "config = Config()\n",
1145
+ "\n",
1146
+ "# Positional Encoding\n",
1147
+ "class PositionalEncoding(nn.Module):\n",
1148
+ " def __init__(self, embed_dim, max_len=100):\n",
1149
+ " super(PositionalEncoding, self).__init__()\n",
1150
+ " pe = torch.zeros(max_len, embed_dim)\n",
1151
+ " position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n",
1152
+ " div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))\n",
1153
+ " pe[:, 0::2] = torch.sin(position * div_term)\n",
1154
+ " pe[:, 1::2] = torch.cos(position * div_term)\n",
1155
+ " self.pe = pe.unsqueeze(0)\n",
1156
+ "\n",
1157
+ " def forward(self, x):\n",
1158
+ " return x + self.pe[:, :x.size(1)].to(x.device)\n",
1159
+ "\n",
1160
+ "# Transformer Model\n",
1161
+ "class CPPtoPseudoTransformer(nn.Module):\n",
1162
+ " def __init__(self, config):\n",
1163
+ " super(CPPtoPseudoTransformer, self).__init__()\n",
1164
+ " self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)\n",
1165
+ " self.positional_encoding = PositionalEncoding(config.embed_dim, config.max_length)\n",
1166
+ "\n",
1167
+ " self.transformer = nn.Transformer(\n",
1168
+ " d_model=config.embed_dim,\n",
1169
+ " nhead=config.num_heads,\n",
1170
+ " num_encoder_layers=config.num_layers,\n",
1171
+ " num_decoder_layers=config.num_layers,\n",
1172
+ " dim_feedforward=config.feedforward_dim,\n",
1173
+ " dropout=config.dropout\n",
1174
+ " )\n",
1175
+ "\n",
1176
+ " self.fc_out = nn.Linear(config.embed_dim, config.vocab_size)\n",
1177
+ " self.dropout = nn.Dropout(config.dropout)\n",
1178
+ "\n",
1179
+ " def generate_square_subsequent_mask(self, sz):\n",
1180
+ " return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(config.device)\n",
1181
+ "\n",
1182
+ " def forward(self, src, tgt):\n",
1183
+ " src_emb = self.embedding(src) * math.sqrt(config.embed_dim)\n",
1184
+ " tgt_emb = self.embedding(tgt) * math.sqrt(config.embed_dim)\n",
1185
+ "\n",
1186
+ " src_emb = self.positional_encoding(src_emb)\n",
1187
+ " tgt_emb = self.positional_encoding(tgt_emb)\n",
1188
+ "\n",
1189
+ " src_mask = self.generate_square_subsequent_mask(src.size(1))\n",
1190
+ " tgt_mask = self.generate_square_subsequent_mask(tgt.size(1))\n",
1191
+ "\n",
1192
+ " out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),\n",
1193
+ " src_mask=src_mask, tgt_mask=tgt_mask)\n",
1194
+ "\n",
1195
+ " out = self.fc_out(out.permute(1, 0, 2))\n",
1196
+ " return out\n",
1197
+ "\n",
1198
+ "# Initialize Model\n",
1199
+ "model = CPPtoPseudoTransformer(config).to(config.device)\n",
1200
+ "print(\"🚀 C++ → Pseudocode Model initialized!\")\n"
1201
+ ],
1202
+ "metadata": {
1203
+ "colab": {
1204
+ "base_uri": "https://localhost:8080/"
1205
+ },
1206
+ "id": "yE-UUuDGI5Az",
1207
+ "outputId": "8513b586-2c83-4a62-f86a-5bb00f3247a9"
1208
+ },
1209
+ "id": "yE-UUuDGI5Az",
1210
+ "execution_count": 17,
1211
+ "outputs": [
1212
+ {
1213
+ "output_type": "stream",
1214
+ "name": "stderr",
1215
+ "text": [
1216
+ "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n",
1217
+ " warnings.warn(\n"
1218
+ ]
1219
+ },
1220
+ {
1221
+ "output_type": "stream",
1222
+ "name": "stdout",
1223
+ "text": [
1224
+ "🚀 C++ → Pseudocode Model initialized!\n"
1225
+ ]
1226
+ }
1227
+ ]
1228
+ },
1229
+ {
1230
+ "cell_type": "code",
1231
+ "source": [
1232
+ "\n",
1233
+ "# Load Vocabulary\n",
1234
+ "with open(\"vocabulary.json\", \"r\") as f:\n",
1235
+ " vocab = json.load(f)\n",
1236
+ "\n",
1237
+ "print(f\"✅ Vocabulary loaded with {len(vocab)} tokens\")\n",
1238
+ "\n"
1239
+ ],
1240
+ "metadata": {
1241
+ "colab": {
1242
+ "base_uri": "https://localhost:8080/"
1243
+ },
1244
+ "id": "CdWsUGr4KHM1",
1245
+ "outputId": "b88217be-45e8-4c85-eed9-679b54627848"
1246
+ },
1247
+ "id": "CdWsUGr4KHM1",
1248
+ "execution_count": 18,
1249
+ "outputs": [
1250
+ {
1251
+ "output_type": "stream",
1252
+ "name": "stdout",
1253
+ "text": [
1254
+ "✅ Vocabulary loaded with 12006 tokens\n"
1255
+ ]
1256
+ }
1257
+ ]
1258
+ },
1259
+ {
1260
+ "cell_type": "code",
1261
+ "source": [
1262
+ "def translate2(model, cpp_tokens, vocab, device, max_length=50):\n",
1263
+ " model.eval()\n",
1264
+ "\n",
1265
+ " # Convert C++ tokens to numerical indices\n",
1266
+ " input_ids = [vocab.get(token, vocab[\"<unk>\"]) for token in cpp_tokens]\n",
1267
+ " input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device) # Add batch dimension\n",
1268
+ "\n",
1269
+ " output_ids = [vocab[\"<start>\"]]\n",
1270
+ "\n",
1271
+ " for _ in range(max_length):\n",
1272
+ " output_tensor = torch.tensor(output_ids, dtype=torch.long).unsqueeze(0).to(device)\n",
1273
+ "\n",
1274
+ " # Get model predictions\n",
1275
+ " with torch.no_grad():\n",
1276
+ " predictions = model(input_tensor, output_tensor)\n",
1277
+ "\n",
1278
+ " # Select the most probable token\n",
1279
+ " next_token_id = predictions.argmax(dim=-1)[:, -1].item()\n",
1280
+ "\n",
1281
+ " if next_token_id == vocab[\"<pad>\"]: # Skip <pad> predictions; NOTE(review): output_ids is unchanged here, so the next iteration feeds the same input again\n",
1282
+ " continue\n",
1283
+ "\n",
1284
+ " output_ids.append(next_token_id)\n",
1285
+ "\n",
1286
+ " if next_token_id == vocab[\"<end>\"]: # Stop if <end> is generated\n",
1287
+ " break\n",
1288
+ "\n",
1289
+ " # Convert token indices back to words\n",
1290
+ " id_to_token = {idx: token for token, idx in vocab.items()}\n",
1291
+ " translated_pseudocode = [id_to_token.get(idx, \"<unk>\") for idx in output_ids[1:]] # Exclude <start>\n",
1292
+ "\n",
1293
+ " return \" \".join(translated_pseudocode)\n"
1294
+ ],
1295
+ "metadata": {
1296
+ "id": "BEEQ_zNHKQRY"
1297
+ },
1298
+ "id": "BEEQ_zNHKQRY",
1299
+ "execution_count": 35,
1300
+ "outputs": []
1301
+ },
1302
+ {
1303
+ "cell_type": "code",
1304
+ "source": [
1305
+ "# Dataset Class\n",
1306
+ "class CPPToPseudoDataset(Dataset):\n",
1307
+ " def __init__(self, file_path):\n",
1308
+ " df = pd.read_csv(file_path)\n",
1309
+ " self.inputs = [ast.literal_eval(x) for x in df['code_sequences']]\n",
1310
+ " self.outputs = [ast.literal_eval(x) for x in df['text_sequences']]\n",
1311
+ "\n",
1312
+ " def __len__(self):\n",
1313
+ " return len(self.inputs)\n",
1314
+ "\n",
1315
+ " def __getitem__(self, idx):\n",
1316
+ " input_tensor = torch.tensor(self.inputs[idx], dtype=torch.int64)\n",
1317
+ " output_tensor = torch.tensor([vocab[\"<start>\"]] + self.outputs[idx] + [vocab[\"<end>\"]], dtype=torch.int64)\n",
1318
+ " return input_tensor, output_tensor\n",
1319
+ "\n",
1320
+ "# Padding Function\n",
1321
+ "def Add_Pad(batch):\n",
1322
+ " inputs, outputs = zip(*batch)\n",
1323
+ " inputs = pad_sequence(inputs, batch_first=True, padding_value=0)\n",
1324
+ " outputs = pad_sequence(outputs, batch_first=True, padding_value=0)\n",
1325
+ " return inputs, outputs\n",
1326
+ "\n",
1327
+ "# Load Dataset\n",
1328
+ "dataset = CPPToPseudoDataset(\"tokenized_sequences.csv\")\n",
1329
+ "dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=Add_Pad)\n",
1330
+ "\n",
1331
+ "print(f\"✅ Loaded {len(dataset)} examples for training\")\n",
1332
+ "\n",
1333
+ "# Training Configuration\n",
1334
+ "criterion = nn.CrossEntropyLoss(ignore_index=vocab[\"<pad>\"])\n",
1335
+ "optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-3)\n",
1336
+ "\n",
1337
+ "# Create directory to save models\n",
1338
+ "os.makedirs(\"checkpoints\", exist_ok=True)\n"
1339
+ ],
1340
+ "metadata": {
1341
+ "colab": {
1342
+ "base_uri": "https://localhost:8080/"
1343
+ },
1344
+ "id": "t3HIGFZCKNaB",
1345
+ "outputId": "f9ac82d1-ff8a-45bc-aa06-9e78738a4021"
1346
+ },
1347
+ "id": "t3HIGFZCKNaB",
1348
+ "execution_count": 34,
1349
+ "outputs": [
1350
+ {
1351
+ "output_type": "stream",
1352
+ "name": "stdout",
1353
+ "text": [
1354
+ "✅ Loaded 246086 examples for training\n"
1355
+ ]
1356
+ }
1357
+ ]
1358
+ },
1359
+ {
1360
+ "cell_type": "code",
1361
+ "source": [
1362
+ "# Training Loop\n",
1363
+ "num_epochs = 1\n",
1364
+ "for epoch in range(num_epochs):\n",
1365
+ " model.train()\n",
1366
+ " epoch_loss = 0\n",
1367
+ "\n",
1368
+ " progress_bar = tqdm(dataloader, desc=f\"Epoch {epoch+1}/{num_epochs}\")\n",
1369
+ " for batch in progress_bar:\n",
1370
+ " src, tgt = batch\n",
1371
+ " src, tgt = src.to(config.device), tgt.to(config.device)\n",
1372
+ "\n",
1373
+ " tgt_input = tgt[:, :-1]\n",
1374
+ " tgt_output = tgt[:, 1:]\n",
1375
+ "\n",
1376
+ " optimizer.zero_grad()\n",
1377
+ " output = model(src, tgt_input)\n",
1378
+ "\n",
1379
+ " loss = criterion(output.view(-1, config.vocab_size), tgt_output.contiguous().view(-1))\n",
1380
+ " loss.backward()\n",
1381
+ " optimizer.step()\n",
1382
+ "\n",
1383
+ " epoch_loss += loss.item()\n",
1384
+ " progress_bar.set_postfix(loss=loss.item())\n",
1385
+ "\n",
1386
+ " avg_loss = epoch_loss / len(dataloader)\n",
1387
+ " print(f\"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}\")\n",
1388
+ "\n",
1389
+ " # Save Model Checkpoint\n",
1390
+ " torch.save(model.state_dict(), f\"checkpoints/cpp_to_pseudo_epoch_{epoch+1}.pth\")\n",
1391
+ " print(f\"✅ Model saved: checkpoints/cpp_to_pseudo_epoch_{epoch+1}.pth\")\n",
1392
+ "\n",
1393
+ " # Print Example Prediction\n",
1394
+ " model.eval()\n",
1395
+ " example_cpp = [\"int\", \"main\", \"(\", \")\", \"{\", \"return\", \"0\", \";\", \"}\"]\n",
1396
+ " translated_pseudocode = translate2(model, example_cpp, vocab, config.device)\n",
1397
+ " print(f\"🔹 Example Prediction (C++ → Pseudocode): {translated_pseudocode}\\n\")\n"
1398
+ ],
1399
+ "metadata": {
1400
+ "colab": {
1401
+ "base_uri": "https://localhost:8080/"
1402
+ },
1403
+ "id": "QXd9vz5EKT8u",
1404
+ "outputId": "692ad75d-6c39-4f68-e577-7bc88d4ccea3"
1405
+ },
1406
+ "id": "QXd9vz5EKT8u",
1407
+ "execution_count": 36,
1408
+ "outputs": [
1409
+ {
1410
+ "output_type": "stream",
1411
+ "name": "stderr",
1412
+ "text": [
1413
+ "Epoch 1/1: 100%|██████████| 3846/3846 [06:10<00:00, 10.38it/s, loss=1.84]\n"
1414
+ ]
1415
+ },
1416
+ {
1417
+ "output_type": "stream",
1418
+ "name": "stdout",
1419
+ "text": [
1420
+ "Epoch [1/1], Loss: 0.9463\n",
1421
+ "✅ Model saved: checkpoints/cpp_to_pseudo_epoch_1.pth\n",
1422
+ "🔹 Example Prediction (C++ → Pseudocode): nan return 0 from function return 0 to int function nan return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 from function return 0 return 0 return 0 return\n",
1423
+ "\n"
1424
+ ]
1425
+ }
1426
+ ]
1427
+ },
1428
+ {
1429
+ "cell_type": "code",
1430
+ "source": [
1431
+ "# Load the trained model\n",
1432
+ "\n",
1433
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
1434
+ "print(f\"🔹 Using device: {device}\")\n",
1435
+ "\n",
1436
+ "# Move model to device\n",
1437
+ "model.to(device)\n",
1438
+ "\n",
1439
+ "model = PseudoCodeTransformer(config).to(device)\n",
1440
+ "model.load_state_dict(torch.load(\"/content/checkpoints/cpp_to_pseudo_epoch_1.pth\", map_location=device))\n",
1441
+ "model.eval()\n",
1442
+ "\n",
1443
+ "example_cpp = [\"int\", \"a\",\"=\", \"10\",\";\"]\n",
1444
+ "translated_pseudocode = translate2(model, example_cpp, vocab, config.device)\n",
1445
+ "print(f\"🔹 Example Prediction (C++ → Pseudocode): {translated_pseudocode}\\n\")\n"
1446
+ ],
1447
+ "metadata": {
1448
+ "colab": {
1449
+ "base_uri": "https://localhost:8080/"
1450
+ },
1451
+ "id": "tfjIcOTkK33z",
1452
+ "outputId": "2e481125-7130-4e79-c213-528162a252e8"
1453
+ },
1454
+ "id": "tfjIcOTkK33z",
1455
+ "execution_count": 40,
1456
+ "outputs": [
1457
+ {
1458
+ "output_type": "stream",
1459
+ "name": "stdout",
1460
+ "text": [
1461
+ "🔹 Using device: cuda\n"
1462
+ ]
1463
+ },
1464
+ {
1465
+ "output_type": "stream",
1466
+ "name": "stderr",
1467
+ "text": [
1468
+ "<ipython-input-40-3042f1a40ae4>:10: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
1469
+ " model.load_state_dict(torch.load(\"/content/checkpoints/cpp_to_pseudo_epoch_1.pth\", map_location=device))\n"
1470
+ ]
1471
+ },
1472
+ {
1473
+ "output_type": "stream",
1474
+ "name": "stdout",
1475
+ "text": [
1476
+ "🔹 Example Prediction (C++ → Pseudocode): create integer a a with a = 10 a = 10 10 integer with a = 10 10 10 10 a 10 a 10 a 10 a 10 a 10 a 10 a 10 integer with a = 10 a 10 a 10 a 10 a 10 a 10 a\n",
1477
+ "\n"
1478
+ ]
1479
+ }
1480
+ ]
1481
+ },
1482
+ {
1483
+ "cell_type": "code",
1484
+ "source": [],
1485
+ "metadata": {
1486
+ "id": "vBmHMGCJM3Cw"
1487
+ },
1488
+ "id": "vBmHMGCJM3Cw",
1489
+ "execution_count": null,
1490
+ "outputs": []
1491
+ }
1492
+ ],
1493
+ "metadata": {
1494
+ "kernelspec": {
1495
+ "display_name": "Python 3",
1496
+ "name": "python3"
1497
+ },
1498
+ "language_info": {
1499
+ "codemirror_mode": {
1500
+ "name": "ipython",
1501
+ "version": 3
1502
+ },
1503
+ "file_extension": ".py",
1504
+ "mimetype": "text/x-python",
1505
+ "name": "python",
1506
+ "nbconvert_exporter": "python",
1507
+ "pygments_lexer": "ipython3",
1508
+ "version": "3.11.7"
1509
+ },
1510
+ "colab": {
1511
+ "provenance": [],
1512
+ "gpuType": "T4"
1513
+ },
1514
+ "accelerator": "GPU"
1515
+ },
1516
+ "nbformat": 4,
1517
+ "nbformat_minor": 5
1518
+ }
transformer_epoch_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a5266dc7e1a780031bc3f7b90ad9d5b301120d03e3dc6c8fc0dbd1ccbeeefc7
3
+ size 40493679
vocabulary.json ADDED
The diff for this file is too large to render. See raw diff