uploaded project

- .gitattributes +3 -35
- app.py +169 -0
- cpp_to_pseudo_epoch_1.pth +3 -0
- requirements.txt +6 -0
- transformer.ipynb +1518 -0
- transformer_epoch_1.pth +3 -0
- vocabulary.json +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,3 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mov filter=lfs diff=lfs merge=lfs -text
+cpp_to_pseudo_epoch_1.pth filter=lfs diff=lfs merge=lfs -text
+transformer_epoch_1.pth filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,169 @@
import streamlit as st
import torch
import torch.nn as nn
import json
import math

# Configure the page for a wide layout.
st.set_page_config(page_title="Code Conversion Tool", layout="wide")

# Inject custom CSS for a modern, centered card design with a gradient background.
st.markdown(
    """
    <style>
    /* Set a subtle gradient background for the page */
    body {
        background: linear-gradient(135deg, #ece9e6, #ffffff);
        font-family: 'Helvetica Neue', sans-serif;
    }
    /* Center container for the main app */
    .main-container {
        max-width: 800px;
        margin: 3rem auto;
        padding: 1rem;
    }
    /* Card style for a clean content box */
    .card {
        background: #ffffff;
        border-radius: 10px;
        box-shadow: 0px 4px 8px rgba(0,0,0,0.1);
        padding: 2rem;
    }
    /* Center headings and remove underline */
    h1, h2, h3 {
        text-align: center;
        text-decoration: none;
    }
    /* Style for the translation button */
    .stButton>button {
        background-color: #4CAF50;
        color: white;
        border: none;
        padding: 0.5rem 1.5rem;
        border-radius: 5px;
        font-size: 1rem;
        cursor: pointer;
    }
    .stButton>button:hover {
        background-color: #45a049;
    }
    </style>
    """,
    unsafe_allow_html=True
)

# Wrap the app content in a centered container.
with st.container():
    # Change the title here.
    st.title("Code Conversion Tool")

    # Load vocabulary directly (no sidebar)
    with open("vocabulary.json", "r") as f:
        vocab = json.load(f)

    # Define separate configuration classes
    class PseudoToCppConfig:
        # Config for Pseudocode → C++ model
        vocab_size = 12006
        max_length = 100
        embed_dim = 256
        num_heads = 4
        num_layers = 3
        feedforward_dim = 512
        dropout = 0.2
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    class CppToPseudoConfig:
        # Config for C++ → Pseudocode model
        vocab_size = 12006
        max_length = 100
        embed_dim = 256
        num_heads = 8
        num_layers = 2
        feedforward_dim = 512
        dropout = 0.1
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Positional Encoding
    class PositionalEncoding(nn.Module):
        def __init__(self, embed_dim, max_len=100):
            super(PositionalEncoding, self).__init__()
            pe = torch.zeros(max_len, embed_dim)
            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            self.pe = pe.unsqueeze(0)

        def forward(self, x):
            return x + self.pe[:, :x.size(1)].to(x.device)

    # Transformer Model
    class Seq2SeqTransformer(nn.Module):
        def __init__(self, config):
            super(Seq2SeqTransformer, self).__init__()
            self.config = config
            self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)
            self.positional_encoding = PositionalEncoding(config.embed_dim, config.max_length)
            self.transformer = nn.Transformer(
                d_model=config.embed_dim,
                nhead=config.num_heads,
                num_encoder_layers=config.num_layers,
                num_decoder_layers=config.num_layers,
                dim_feedforward=config.feedforward_dim,
                dropout=config.dropout
            )
            self.fc_out = nn.Linear(config.embed_dim, config.vocab_size)

        def forward(self, src, tgt):
            src_emb = self.embedding(src) * math.sqrt(self.config.embed_dim)
            tgt_emb = self.embedding(tgt) * math.sqrt(self.config.embed_dim)
            src_emb = self.positional_encoding(src_emb)
            tgt_emb = self.positional_encoding(tgt_emb)
            out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2))
            out = self.fc_out(out.permute(1, 0, 2))
            return out

    # Load Models with the appropriate configuration
    @st.cache_resource
    def load_model(path, config):
        model = Seq2SeqTransformer(config).to(config.device)
        model.load_state_dict(torch.load(path, map_location=config.device))
        model.eval()
        return model

    cpp_to_pseudo_model = load_model("cpp_to_pseudo_epoch_1.pth", CppToPseudoConfig)
    pseudo_to_cpp_model = load_model("transformer_epoch_1.pth", PseudoToCppConfig)

    # Translation Function
    def translate(model, input_tokens, vocab, device, max_length=50):
        model.eval()
        input_ids = [vocab.get(token, vocab["<unk>"]) for token in input_tokens]
        input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)
        output_ids = [vocab["<start>"]]
        for _ in range(max_length):
            output_tensor = torch.tensor(output_ids, dtype=torch.long).unsqueeze(0).to(device)
            with torch.no_grad():
                predictions = model(input_tensor, output_tensor)
            next_token_id = predictions.argmax(dim=-1)[:, -1].item()
            output_ids.append(next_token_id)
            if next_token_id == vocab["<end>"]:
                break
        id_to_token = {idx: token for token, idx in vocab.items()}
        return " ".join([id_to_token.get(idx, "<unk>") for idx in output_ids[1:]])

    # UI Elements for Translation
    mode = st.radio("Select Translation Mode", ("C++ → Pseudocode", "Pseudocode → C++"))
    user_input = st.text_area("Enter code:")

    if st.button("Translate"):
        tokens = user_input.strip().split()
        if mode == "C++ → Pseudocode":
            translated_code = translate(cpp_to_pseudo_model, tokens, vocab, CppToPseudoConfig.device)
        else:
            translated_code = translate(pseudo_to_cpp_model, tokens, vocab, PseudoToCppConfig.device)
        st.subheader("Generated Translation:")
        st.code(translated_code, language="cpp" if mode == "Pseudocode → C++" else "python")

    st.markdown('</div>', unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)
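
For a quick smoke test of the same pipeline outside Streamlit, the model classes and `translate` can be imported straight from app.py (they live at module scope even though they sit inside the `with st.container():` block). The following is a minimal sketch, not part of the commit, assuming the two `.pth` checkpoints and `vocabulary.json` are in the working directory; importing app.py executes its Streamlit calls in bare mode, which only emits warnings.

# Hypothetical local smoke test -- not part of this commit.
# Importing app builds both cached models, so the .pth files and
# vocabulary.json must already be present in the working directory.
import json
from app import cpp_to_pseudo_model, CppToPseudoConfig, translate

with open("vocabulary.json", "r") as f:
    vocab = json.load(f)

# The app whitespace-splits user input, so pass pre-split C++ tokens here.
print(translate(cpp_to_pseudo_model, "int x ;".split(), vocab,
                CppToPseudoConfig.device))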
cpp_to_pseudo_epoch_1.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98ef6b0c38eeb68e6258f24aae44773996b89b3c624563369119004c5261c992
size 35210415
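
The three lines above are a Git LFS pointer, not the checkpoint itself: the `oid` is the SHA-256 of the real ~35 MB weight file that LFS stores out of band, which is why the `.gitattributes` change above routes the two `.pth` files through LFS. A minimal sketch (hypothetical helper, not in the commit) for checking whether a cloned file is still a pointer stub rather than the actual weights:

# Hypothetical helper -- detects a Git LFS pointer stub left behind by a
# clone made without `git lfs pull`; a real checkpoint starts with a
# binary zip/pickle header rather than the LFS spec line.
def is_lfs_pointer(path: str) -> bool:
    with open(path, "rb") as f:
        head = f.read(64)
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

print(is_lfs_pointer("cpp_to_pseudo_epoch_1.pth"))  # True -> run `git lfs pull`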
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit
torch
torchvision
torchaudio
numpy
pandas
transformer.ipynb
ADDED
@@ -0,0 +1,1518 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "4ae62f55",
|
6 |
+
"metadata": {
|
7 |
+
"id": "4ae62f55"
|
8 |
+
},
|
9 |
+
"source": [
|
10 |
+
"# LOAD DATA"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": null,
|
16 |
+
"id": "fdf35712",
|
17 |
+
"metadata": {
|
18 |
+
"colab": {
|
19 |
+
"base_uri": "https://localhost:8080/"
|
20 |
+
},
|
21 |
+
"id": "fdf35712",
|
22 |
+
"outputId": "aba7cfe3-c992-452c-db12-378eea703c32"
|
23 |
+
},
|
24 |
+
"outputs": [
|
25 |
+
{
|
26 |
+
"output_type": "stream",
|
27 |
+
"name": "stderr",
|
28 |
+
"text": [
|
29 |
+
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
|
30 |
+
"[nltk_data] Package punkt is already up-to-date!\n",
|
31 |
+
"[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
|
32 |
+
"[nltk_data] Package punkt_tab is already up-to-date!\n",
|
33 |
+
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
|
34 |
+
"[nltk_data] Package punkt is already up-to-date!\n"
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"output_type": "stream",
|
39 |
+
"name": "stdout",
|
40 |
+
"text": [
|
41 |
+
" text code workerid probid \\\n",
|
42 |
+
"0 NaN int main() { 1 3A \n",
|
43 |
+
"1 create string s string s; 1 3A \n",
|
44 |
+
"2 create integers x1, y1, x2, y2 int x1, y1, x2, y2; 1 3A \n",
|
45 |
+
"3 read s cin >> s; 1 3A \n",
|
46 |
+
"4 set x1 to s[0] - 96 x1 = s[0] - 96; 1 3A \n",
|
47 |
+
"\n",
|
48 |
+
" subid line indent \n",
|
49 |
+
"0 41470897 0 0 \n",
|
50 |
+
"1 41470897 1 1 \n",
|
51 |
+
"2 41470897 2 1 \n",
|
52 |
+
"3 41470897 3 1 \n",
|
53 |
+
"4 41470897 4 1 \n"
|
54 |
+
]
|
55 |
+
}
|
56 |
+
],
|
57 |
+
"source": [
|
58 |
+
"import pandas as pd\n",
|
59 |
+
"import nltk\n",
|
60 |
+
"from nltk.tokenize import word_tokenize\n",
|
61 |
+
"import nltk\n",
|
62 |
+
"\n",
|
63 |
+
"# Download the 'punkt' tokenizer data\n",
|
64 |
+
"nltk.download('punkt')\n",
|
65 |
+
"\n",
|
66 |
+
"# If 'punkt_tab' is still missing, try:\n",
|
67 |
+
"nltk.download('punkt_tab')\n",
|
68 |
+
"\n",
|
69 |
+
"\n",
|
70 |
+
"# Download punkt tokenizer\n",
|
71 |
+
"nltk.download('punkt')\n",
|
72 |
+
"\n",
|
73 |
+
"\n",
|
74 |
+
"# Load dataset\n",
|
75 |
+
"file_path = r\"/content/spoc-train-train.csv\" # Change to your dataset's path\n",
|
76 |
+
"df = pd.read_csv(file_path)\n",
|
77 |
+
"\n",
|
78 |
+
"# Display dataset information\n",
|
79 |
+
"print(df.head())\n"
|
80 |
+
]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"cell_type": "code",
|
84 |
+
"execution_count": null,
|
85 |
+
"id": "6889f401",
|
86 |
+
"metadata": {
|
87 |
+
"colab": {
|
88 |
+
"base_uri": "https://localhost:8080/"
|
89 |
+
},
|
90 |
+
"id": "6889f401",
|
91 |
+
"outputId": "55269b88-418b-4fd5-d4ed-065109f899e7"
|
92 |
+
},
|
93 |
+
"outputs": [
|
94 |
+
{
|
95 |
+
"output_type": "stream",
|
96 |
+
"name": "stdout",
|
97 |
+
"text": [
|
98 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
99 |
+
"RangeIndex: 246086 entries, 0 to 246085\n",
|
100 |
+
"Data columns (total 7 columns):\n",
|
101 |
+
" # Column Non-Null Count Dtype \n",
|
102 |
+
"--- ------ -------------- ----- \n",
|
103 |
+
" 0 text 181862 non-null object\n",
|
104 |
+
" 1 code 246086 non-null object\n",
|
105 |
+
" 2 workerid 246086 non-null int64 \n",
|
106 |
+
" 3 probid 246086 non-null object\n",
|
107 |
+
" 4 subid 246086 non-null int64 \n",
|
108 |
+
" 5 line 246086 non-null int64 \n",
|
109 |
+
" 6 indent 246086 non-null int64 \n",
|
110 |
+
"dtypes: int64(4), object(3)\n",
|
111 |
+
"memory usage: 13.1+ MB\n",
|
112 |
+
"None\n"
|
113 |
+
]
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"source": [
|
117 |
+
"print(df.info())\n"
|
118 |
+
]
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"cell_type": "markdown",
|
122 |
+
"id": "dd3ad88e",
|
123 |
+
"metadata": {
|
124 |
+
"id": "dd3ad88e"
|
125 |
+
},
|
126 |
+
"source": [
|
127 |
+
"# DATA PREPROCESSING"
|
128 |
+
]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"cell_type": "code",
|
132 |
+
"execution_count": null,
|
133 |
+
"id": "3a08429f",
|
134 |
+
"metadata": {
|
135 |
+
"id": "3a08429f"
|
136 |
+
},
|
137 |
+
"outputs": [],
|
138 |
+
"source": [
|
139 |
+
"from nltk.tokenize import word_tokenize\n",
|
140 |
+
"\n",
|
141 |
+
"df[\"text\"] = df[\"text\"].astype(str)\n",
|
142 |
+
"df[\"text_tokens\"] = df[\"text\"].apply(word_tokenize)\n"
|
143 |
+
]
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"cell_type": "code",
|
147 |
+
"execution_count": null,
|
148 |
+
"id": "80e52203",
|
149 |
+
"metadata": {
|
150 |
+
"id": "80e52203"
|
151 |
+
},
|
152 |
+
"outputs": [],
|
153 |
+
"source": [
|
154 |
+
"df[\"text\"] = df[\"text\"].fillna(\"\") # Replace NaN with empty strings\n",
|
155 |
+
"df[\"text_tokens\"] = df[\"text\"].apply(word_tokenize)\n"
|
156 |
+
]
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"cell_type": "code",
|
160 |
+
"execution_count": null,
|
161 |
+
"id": "3e73cd24",
|
162 |
+
"metadata": {
|
163 |
+
"id": "3e73cd24"
|
164 |
+
},
|
165 |
+
"outputs": [],
|
166 |
+
"source": [
|
167 |
+
"df[\"text_tokens\"] = df[\"text\"].apply(lambda x: word_tokenize(x) if isinstance(x, str) else [])\n"
|
168 |
+
]
|
169 |
+
},
|
170 |
+
{
|
171 |
+
"cell_type": "code",
|
172 |
+
"execution_count": null,
|
173 |
+
"id": "123f02bd",
|
174 |
+
"metadata": {
|
175 |
+
"colab": {
|
176 |
+
"base_uri": "https://localhost:8080/"
|
177 |
+
},
|
178 |
+
"id": "123f02bd",
|
179 |
+
"outputId": "cf09d02c-e55b-4857-b46f-1aee035bab36"
|
180 |
+
},
|
181 |
+
"outputs": [
|
182 |
+
{
|
183 |
+
"output_type": "stream",
|
184 |
+
"name": "stdout",
|
185 |
+
"text": [
|
186 |
+
"object\n",
|
187 |
+
"0\n",
|
188 |
+
"Empty DataFrame\n",
|
189 |
+
"Columns: [text, code, workerid, probid, subid, line, indent, text_tokens]\n",
|
190 |
+
"Index: []\n"
|
191 |
+
]
|
192 |
+
}
|
193 |
+
],
|
194 |
+
"source": [
|
195 |
+
"print(df[\"text\"].dtype) # Check the column's data type\n",
|
196 |
+
"print(df[\"text\"].isna().sum()) # Count missing values\n",
|
197 |
+
"print(df[df[\"text\"].apply(lambda x: not isinstance(x, str))]) # Show non-string values\n"
|
198 |
+
]
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"cell_type": "markdown",
|
202 |
+
"id": "7666823c",
|
203 |
+
"metadata": {
|
204 |
+
"id": "7666823c"
|
205 |
+
},
|
206 |
+
"source": [
|
207 |
+
"# TOKENIZATION"
|
208 |
+
]
|
209 |
+
},
|
210 |
+
{
|
211 |
+
"cell_type": "code",
|
212 |
+
"execution_count": null,
|
213 |
+
"id": "1527b229",
|
214 |
+
"metadata": {
|
215 |
+
"colab": {
|
216 |
+
"base_uri": "https://localhost:8080/"
|
217 |
+
},
|
218 |
+
"id": "1527b229",
|
219 |
+
"outputId": "3911e731-9c6a-40a7-f625-e59d4491d13e"
|
220 |
+
},
|
221 |
+
"outputs": [
|
222 |
+
{
|
223 |
+
"output_type": "stream",
|
224 |
+
"name": "stderr",
|
225 |
+
"text": [
|
226 |
+
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
|
227 |
+
"[nltk_data] Package punkt is already up-to-date!\n"
|
228 |
+
]
|
229 |
+
}
|
230 |
+
],
|
231 |
+
"source": [
|
232 |
+
"import nltk\n",
|
233 |
+
"from nltk.tokenize import word_tokenize\n",
|
234 |
+
"\n",
|
235 |
+
"# Download tokenizer if not available\n",
|
236 |
+
"nltk.download('punkt')\n",
|
237 |
+
"\n",
|
238 |
+
"# Tokenizing pseudocode and code\n",
|
239 |
+
"df[\"text_tokens\"] = df[\"text\"].apply(word_tokenize)\n",
|
240 |
+
"df[\"code_tokens\"] = df[\"code\"].apply(word_tokenize)\n",
|
241 |
+
"\n",
|
242 |
+
"\n"
|
243 |
+
]
|
244 |
+
},
|
245 |
+
{
|
246 |
+
"cell_type": "markdown",
|
247 |
+
"id": "62e8e137",
|
248 |
+
"metadata": {
|
249 |
+
"id": "62e8e137"
|
250 |
+
},
|
251 |
+
"source": [
|
252 |
+
"# PRINT TOKEN SAMPLES"
|
253 |
+
]
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"cell_type": "code",
|
257 |
+
"execution_count": null,
|
258 |
+
"id": "3667d51e",
|
259 |
+
"metadata": {
|
260 |
+
"colab": {
|
261 |
+
"base_uri": "https://localhost:8080/"
|
262 |
+
},
|
263 |
+
"id": "3667d51e",
|
264 |
+
"outputId": "9619ff63-be61-467c-a1a4-0cf13b14fced"
|
265 |
+
},
|
266 |
+
"outputs": [
|
267 |
+
{
|
268 |
+
"output_type": "stream",
|
269 |
+
"name": "stdout",
|
270 |
+
"text": [
|
271 |
+
"Samples from index 1-10:\n",
|
272 |
+
"Index 1:\n",
|
273 |
+
"Tokenized Pseudocode: ['create', 'string', 's']\n",
|
274 |
+
"Tokenized C++ Code: ['string', 's', ';']\n",
|
275 |
+
"--------------------------------------------------\n",
|
276 |
+
"Index 2:\n",
|
277 |
+
"Tokenized Pseudocode: ['create', 'integers', 'x1', ',', 'y1', ',', 'x2', ',', 'y2']\n",
|
278 |
+
"Tokenized C++ Code: ['int', 'x1', ',', 'y1', ',', 'x2', ',', 'y2', ';']\n",
|
279 |
+
"--------------------------------------------------\n",
|
280 |
+
"Index 3:\n",
|
281 |
+
"Tokenized Pseudocode: ['read', 's']\n",
|
282 |
+
"Tokenized C++ Code: ['cin', '>', '>', 's', ';']\n",
|
283 |
+
"--------------------------------------------------\n",
|
284 |
+
"Index 4:\n",
|
285 |
+
"Tokenized Pseudocode: ['set', 'x1', 'to', 's', '[', '0', ']', '-', '96']\n",
|
286 |
+
"Tokenized C++ Code: ['x1', '=', 's', '[', '0', ']', '-', '96', ';']\n",
|
287 |
+
"--------------------------------------------------\n",
|
288 |
+
"Index 5:\n",
|
289 |
+
"Tokenized Pseudocode: ['set', 'y1', 'to', 's', '[', '1', ']', '-', \"'\", '0', \"'\"]\n",
|
290 |
+
"Tokenized C++ Code: ['y1', '=', 's', '[', '1', ']', '-', \"'\", '0', \"'\", ';']\n",
|
291 |
+
"--------------------------------------------------\n",
|
292 |
+
"Index 6:\n",
|
293 |
+
"Tokenized Pseudocode: ['read', 's']\n",
|
294 |
+
"Tokenized C++ Code: ['cin', '>', '>', 's', ';']\n",
|
295 |
+
"--------------------------------------------------\n",
|
296 |
+
"Index 7:\n",
|
297 |
+
"Tokenized Pseudocode: ['set', 'x2', 'to', 's', '[', '0', ']', '-', '96']\n",
|
298 |
+
"Tokenized C++ Code: ['x2', '=', 's', '[', '0', ']', '-', '96', ';']\n",
|
299 |
+
"--------------------------------------------------\n",
|
300 |
+
"Index 8:\n",
|
301 |
+
"Tokenized Pseudocode: ['set', 'y2', 'to', 's', '[', '1', ']', '-', \"'\", '0', \"'\"]\n",
|
302 |
+
"Tokenized C++ Code: ['y2', '=', 's', '[', '1', ']', '-', \"'\", '0', \"'\", ';']\n",
|
303 |
+
"--------------------------------------------------\n",
|
304 |
+
"Index 9:\n",
|
305 |
+
"Tokenized Pseudocode: ['print', 'maximum', 'of', 'absolute', 'value', 'of', 'x1', '-', 'x2', 'and', 'absolute', 'value', 'of', 'y1', '-', 'y2', ',', 'print', 'newline']\n",
|
306 |
+
"Tokenized C++ Code: ['cout', '<', '<', 'max', '(', 'abs', '(', 'x1', '-', 'x2', ')', ',', 'abs', '(', 'y1', '-', 'y2', ')', ')', '<', '<', 'endl', ';']\n",
|
307 |
+
"--------------------------------------------------\n",
|
308 |
+
"Index 10:\n",
|
309 |
+
"Tokenized Pseudocode: ['while', 'x1', 'is', 'not', 'x2', 'or', 'y1', 'is', 'not', 'y2']\n",
|
310 |
+
"Tokenized C++ Code: ['while', '(', 'x1', '!', '=', 'x2', '||', 'y1', '!', '=', 'y2', ')', '{']\n",
|
311 |
+
"--------------------------------------------------\n",
|
312 |
+
"\n",
|
313 |
+
"Samples from index 20-30:\n",
|
314 |
+
"Index 20:\n",
|
315 |
+
"Tokenized Pseudocode: ['print', '``', 'D', \"''\"]\n",
|
316 |
+
"Tokenized C++ Code: ['cout', '<', '<', '``', 'D', \"''\", ';']\n",
|
317 |
+
"--------------------------------------------------\n",
|
318 |
+
"Index 21:\n",
|
319 |
+
"Tokenized Pseudocode: ['decrement', 'y1']\n",
|
320 |
+
"Tokenized C++ Code: ['y1', '--', ';']\n",
|
321 |
+
"--------------------------------------------------\n",
|
322 |
+
"Index 22:\n",
|
323 |
+
"Tokenized Pseudocode: ['nan']\n",
|
324 |
+
"Tokenized C++ Code: ['}']\n",
|
325 |
+
"--------------------------------------------------\n",
|
326 |
+
"Index 23:\n",
|
327 |
+
"Tokenized Pseudocode: ['if', 'y1', 'is', 'less', 'than', 'y2']\n",
|
328 |
+
"Tokenized C++ Code: ['if', '(', 'y1', '<', 'y2', ')', '{']\n",
|
329 |
+
"--------------------------------------------------\n",
|
330 |
+
"Index 24:\n",
|
331 |
+
"Tokenized Pseudocode: ['print', '``', 'U', \"''\"]\n",
|
332 |
+
"Tokenized C++ Code: ['cout', '<', '<', '``', 'U', \"''\", ';']\n",
|
333 |
+
"--------------------------------------------------\n",
|
334 |
+
"Index 25:\n",
|
335 |
+
"Tokenized Pseudocode: ['increment', 'y1']\n",
|
336 |
+
"Tokenized C++ Code: ['y1++', ';']\n",
|
337 |
+
"--------------------------------------------------\n",
|
338 |
+
"Index 26:\n",
|
339 |
+
"Tokenized Pseudocode: ['nan']\n",
|
340 |
+
"Tokenized C++ Code: ['}']\n",
|
341 |
+
"--------------------------------------------------\n",
|
342 |
+
"Index 27:\n",
|
343 |
+
"Tokenized Pseudocode: ['print', '``', '\\\\n', \"''\"]\n",
|
344 |
+
"Tokenized C++ Code: ['cout', '<', '<', '``', '\\\\n', \"''\", ';']\n",
|
345 |
+
"--------------------------------------------------\n",
|
346 |
+
"Index 28:\n",
|
347 |
+
"Tokenized Pseudocode: ['nan']\n",
|
348 |
+
"Tokenized C++ Code: ['}']\n",
|
349 |
+
"--------------------------------------------------\n",
|
350 |
+
"Index 29:\n",
|
351 |
+
"Tokenized Pseudocode: ['nan']\n",
|
352 |
+
"Tokenized C++ Code: ['return', '0', ';']\n",
|
353 |
+
"--------------------------------------------------\n",
|
354 |
+
"Index 30:\n",
|
355 |
+
"Tokenized Pseudocode: ['nan']\n",
|
356 |
+
"Tokenized C++ Code: ['}']\n",
|
357 |
+
"--------------------------------------------------\n"
|
358 |
+
]
|
359 |
+
}
|
360 |
+
],
|
361 |
+
"source": [
|
362 |
+
"# Print samples from index 1-10\n",
|
363 |
+
"print(\"Samples from index 1-10:\")\n",
|
364 |
+
"for i in range(1, 11):\n",
|
365 |
+
" print(f\"Index {i}:\")\n",
|
366 |
+
" print(\"Tokenized Pseudocode:\", df[\"text_tokens\"].iloc[i])\n",
|
367 |
+
" print(\"Tokenized C++ Code:\", df[\"code_tokens\"].iloc[i])\n",
|
368 |
+
" print(\"-\" * 50)\n",
|
369 |
+
"\n",
|
370 |
+
"# Print samples from index 20-30\n",
|
371 |
+
"print(\"\\nSamples from index 20-30:\")\n",
|
372 |
+
"for i in range(20, 31):\n",
|
373 |
+
" print(f\"Index {i}:\")\n",
|
374 |
+
" print(\"Tokenized Pseudocode:\", df[\"text_tokens\"].iloc[i])\n",
|
375 |
+
" print(\"Tokenized C++ Code:\", df[\"code_tokens\"].iloc[i])\n",
|
376 |
+
" print(\"-\" * 50)\n"
|
377 |
+
]
|
378 |
+
},
|
379 |
+
{
|
380 |
+
"cell_type": "code",
|
381 |
+
"source": [
|
382 |
+
"# Save tokenized pseudocode and C++ code to CSV\n",
|
383 |
+
"output_file = \"tokenized_spoc.csv\"\n",
|
384 |
+
"df[[\"text_tokens\", \"code_tokens\"]].to_csv(output_file, index=False)\n",
|
385 |
+
"\n",
|
386 |
+
"print(f\"Tokenized data saved to {output_file}\")\n"
|
387 |
+
],
|
388 |
+
"metadata": {
|
389 |
+
"id": "Sd8I0TttbCaZ",
|
390 |
+
"colab": {
|
391 |
+
"base_uri": "https://localhost:8080/"
|
392 |
+
},
|
393 |
+
"outputId": "05541ebd-454f-455d-e2a3-57f6df689997"
|
394 |
+
},
|
395 |
+
"id": "Sd8I0TttbCaZ",
|
396 |
+
"execution_count": null,
|
397 |
+
"outputs": [
|
398 |
+
{
|
399 |
+
"output_type": "stream",
|
400 |
+
"name": "stdout",
|
401 |
+
"text": [
|
402 |
+
"Tokenized data saved to tokenized_spoc.csv\n"
|
403 |
+
]
|
404 |
+
}
|
405 |
+
]
|
406 |
+
},
|
407 |
+
{
|
408 |
+
"cell_type": "code",
|
409 |
+
"source": [
|
410 |
+
"# Add start and end tokens to tokenized C++ code\n",
|
411 |
+
"df[\"code_tokens\"] = df[\"code_tokens\"].apply(lambda tokens: [\"<start>\"] + tokens + [\"<end>\"])\n",
|
412 |
+
"\n",
|
413 |
+
"# Save updated tokenized data to CSV\n",
|
414 |
+
"output_file = \"tokenized_spoc_with_tokens.csv\"\n",
|
415 |
+
"df[[\"text_tokens\", \"code_tokens\"]].to_csv(output_file, index=False)\n",
|
416 |
+
"\n",
|
417 |
+
"print(f\"Updated tokenized data saved to {output_file}\")\n"
|
418 |
+
],
|
419 |
+
"metadata": {
|
420 |
+
"colab": {
|
421 |
+
"base_uri": "https://localhost:8080/"
|
422 |
+
},
|
423 |
+
"id": "-HAGLVzqXEQy",
|
424 |
+
"outputId": "59f572b7-bf31-4171-9112-2edec63c3937"
|
425 |
+
},
|
426 |
+
"id": "-HAGLVzqXEQy",
|
427 |
+
"execution_count": null,
|
428 |
+
"outputs": [
|
429 |
+
{
|
430 |
+
"output_type": "stream",
|
431 |
+
"name": "stdout",
|
432 |
+
"text": [
|
433 |
+
"Updated tokenized data saved to tokenized_spoc_with_tokens.csv\n"
|
434 |
+
]
|
435 |
+
}
|
436 |
+
]
|
437 |
+
},
|
438 |
+
{
|
439 |
+
"cell_type": "code",
|
440 |
+
"source": [
|
441 |
+
"# Make \"text_tokens\" and \"code_tokens\" length same by padding with \"<pad>\"\n",
|
442 |
+
"max_len = max(df[\"text_tokens\"].apply(len).max(), df[\"code_tokens\"].apply(len).max())\n",
|
443 |
+
"\n",
|
444 |
+
"df[\"text_tokens\"] = df[\"text_tokens\"].apply(lambda tokens: tokens + [\"<pad>\"] * (max_len - len(tokens)))\n",
|
445 |
+
"df[\"code_tokens\"] = df[\"code_tokens\"].apply(lambda tokens: tokens + [\"<pad>\"] * (max_len - len(tokens)))\n",
|
446 |
+
"\n",
|
447 |
+
"# Save padded tokenized data to CSV\n",
|
448 |
+
"output_file = \"tokenized_spoc_padded.csv\"\n",
|
449 |
+
"df[[\"text_tokens\", \"code_tokens\"]].to_csv(output_file, index=False)\n",
|
450 |
+
"\n",
|
451 |
+
"print(f\"Padded tokenized data saved to {output_file}\")\n"
|
452 |
+
],
|
453 |
+
"metadata": {
|
454 |
+
"colab": {
|
455 |
+
"base_uri": "https://localhost:8080/"
|
456 |
+
},
|
457 |
+
"id": "6tRCwKS0X25B",
|
458 |
+
"outputId": "32366e38-5c7b-4c45-cc7a-6413dc6df8da"
|
459 |
+
},
|
460 |
+
"id": "6tRCwKS0X25B",
|
461 |
+
"execution_count": null,
|
462 |
+
"outputs": [
|
463 |
+
{
|
464 |
+
"output_type": "stream",
|
465 |
+
"name": "stdout",
|
466 |
+
"text": [
|
467 |
+
"Padded tokenized data saved to tokenized_spoc_padded.csv\n"
|
468 |
+
]
|
469 |
+
}
|
470 |
+
]
|
471 |
+
},
|
472 |
+
{
|
473 |
+
"cell_type": "code",
|
474 |
+
"source": [
|
475 |
+
"import json\n",
|
476 |
+
"\n",
|
477 |
+
"# Define special tokens with fixed indices\n",
|
478 |
+
"vocab = {\n",
|
479 |
+
" \"<unk>\": 0,\n",
|
480 |
+
" \"<pad>\": 1,\n",
|
481 |
+
" \"<start>\": 2,\n",
|
482 |
+
" \"<end>\": 3\n",
|
483 |
+
"}\n",
|
484 |
+
"\n",
|
485 |
+
"# Assign indices to other tokens\n",
|
486 |
+
"for column in [\"text_tokens\", \"code_tokens\"]:\n",
|
487 |
+
" for tokens in df[column]:\n",
|
488 |
+
" for token in tokens:\n",
|
489 |
+
" if token not in vocab:\n",
|
490 |
+
" vocab[token] = len(vocab)\n",
|
491 |
+
"\n",
|
492 |
+
"# Save vocabulary to JSON\n",
|
493 |
+
"vocab_file = \"vocabulary.json\"\n",
|
494 |
+
"with open(vocab_file, \"w\") as f:\n",
|
495 |
+
" json.dump(vocab, f, indent=4)\n",
|
496 |
+
"\n",
|
497 |
+
"print(f\"Vocabulary saved to {vocab_file}\")\n"
|
498 |
+
],
|
499 |
+
"metadata": {
|
500 |
+
"colab": {
|
501 |
+
"base_uri": "https://localhost:8080/"
|
502 |
+
},
|
503 |
+
"id": "r4hbVXb5YI4-",
|
504 |
+
"outputId": "36805383-5738-4634-fc97-472fd68c399c"
|
505 |
+
},
|
506 |
+
"id": "r4hbVXb5YI4-",
|
507 |
+
"execution_count": null,
|
508 |
+
"outputs": [
|
509 |
+
{
|
510 |
+
"output_type": "stream",
|
511 |
+
"name": "stdout",
|
512 |
+
"text": [
|
513 |
+
"Vocabulary saved to vocabulary.json\n"
|
514 |
+
]
|
515 |
+
}
|
516 |
+
]
|
517 |
+
},
|
518 |
+
{
|
519 |
+
"cell_type": "code",
|
520 |
+
"source": [
|
521 |
+
"# Load vocabulary\n",
|
522 |
+
"with open(\"vocabulary.json\", \"r\") as f:\n",
|
523 |
+
" vocab = json.load(f)\n",
|
524 |
+
"\n",
|
525 |
+
"# Load tokenized data\n",
|
526 |
+
"df = pd.read_csv(\"/content/tokenized_spoc_padded.csv\")\n",
|
527 |
+
"\n",
|
528 |
+
"# Convert string tokens to lists\n",
|
529 |
+
"df[\"text_tokens\"] = df[\"text_tokens\"].apply(eval)\n",
|
530 |
+
"df[\"code_tokens\"] = df[\"code_tokens\"].apply(eval)\n",
|
531 |
+
"\n",
|
532 |
+
"# Convert tokens to sequences using vocabulary\n",
|
533 |
+
"df[\"text_sequences\"] = df[\"text_tokens\"].apply(lambda tokens: [vocab.get(token, vocab[\"<unk>\"]) for token in tokens])\n",
|
534 |
+
"df[\"code_sequences\"] = df[\"code_tokens\"].apply(lambda tokens: [vocab.get(token, vocab[\"<unk>\"]) for token in tokens])\n",
|
535 |
+
"\n",
|
536 |
+
"# Save sequences to CSV\n",
|
537 |
+
"output_file = \"tokenized_sequences.csv\"\n",
|
538 |
+
"df[[\"text_sequences\", \"code_sequences\"]].to_csv(output_file, index=False)\n",
|
539 |
+
"\n",
|
540 |
+
"print(f\"Tokenized sequences saved to {output_file}\")\n"
|
541 |
+
],
|
542 |
+
"metadata": {
|
543 |
+
"colab": {
|
544 |
+
"base_uri": "https://localhost:8080/"
|
545 |
+
},
|
546 |
+
"id": "_TEFKw4KY6VO",
|
547 |
+
"outputId": "53b130fa-6c16-4356-9979-3a87b00e53ca"
|
548 |
+
},
|
549 |
+
"id": "_TEFKw4KY6VO",
|
550 |
+
"execution_count": null,
|
551 |
+
"outputs": [
|
552 |
+
{
|
553 |
+
"output_type": "stream",
|
554 |
+
"name": "stdout",
|
555 |
+
"text": [
|
556 |
+
"Tokenized sequences saved to tokenized_sequences.csv\n"
|
557 |
+
]
|
558 |
+
}
|
559 |
+
]
|
560 |
+
},
|
561 |
+
{
|
562 |
+
"cell_type": "code",
|
563 |
+
"source": [
|
564 |
+
"from torch.utils.data import DataLoader, Dataset\n",
|
565 |
+
"import pandas as pd\n",
|
566 |
+
"import torch\n",
|
567 |
+
"import ast\n",
|
568 |
+
"from torch.nn.utils.rnn import pad_sequence\n",
|
569 |
+
"from tqdm import tqdm\n",
|
570 |
+
"\n",
|
571 |
+
"class DataLoad(Dataset):\n",
|
572 |
+
" def __init__(self, file_path):\n",
|
573 |
+
" df = pd.read_csv(file_path)\n",
|
574 |
+
" self.inputs = [ast.literal_eval(x) for x in df['text_sequences']]\n",
|
575 |
+
" self.outputs = [ast.literal_eval(x) for x in df['code_sequences']]\n",
|
576 |
+
"\n",
|
577 |
+
" def __len__(self):\n",
|
578 |
+
" return len(self.inputs)\n",
|
579 |
+
"\n",
|
580 |
+
" def __getitem__(self, idx):\n",
|
581 |
+
" input_tensor = torch.tensor(self.inputs[idx], dtype=torch.int64)\n",
|
582 |
+
" output_tensor = torch.tensor(self.outputs[idx], dtype=torch.int64)\n",
|
583 |
+
" return input_tensor, output_tensor\n",
|
584 |
+
"\n",
|
585 |
+
"def Add_Pad(batch):\n",
|
586 |
+
" inputs, outputs = zip(*batch)\n",
|
587 |
+
" inputs = pad_sequence(inputs, batch_first=True, padding_value=0)\n",
|
588 |
+
" outputs = pad_sequence(outputs, batch_first=True, padding_value=0)\n",
|
589 |
+
" return inputs, outputs\n",
|
590 |
+
"\n",
|
591 |
+
"# Load dataset and dataloader\n",
|
592 |
+
"dataset = DataLoad('/content/tokenized_sequences.csv')\n",
|
593 |
+
"dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=Add_Pad)\n",
|
594 |
+
"\n",
|
595 |
+
"# Iterate with progress bar\n",
|
596 |
+
"data_iter = iter(dataloader)\n",
|
597 |
+
"for batch in tqdm(dataloader, desc=\"Loading Batches\"):\n",
|
598 |
+
" features, labels = batch # Get a batch of data\n",
|
599 |
+
" break # Remove this if you want to iterate over all batches\n",
|
600 |
+
"\n",
|
601 |
+
"print(\"Batch loaded successfully!\")\n"
|
602 |
+
],
|
603 |
+
"metadata": {
|
604 |
+
"colab": {
|
605 |
+
"base_uri": "https://localhost:8080/"
|
606 |
+
},
|
607 |
+
"id": "8_ySZqiqaHUD",
|
608 |
+
"outputId": "87124b59-8b7b-42ba-bba0-9c9a0ca1926c"
|
609 |
+
},
|
610 |
+
"id": "8_ySZqiqaHUD",
|
611 |
+
"execution_count": 16,
|
612 |
+
"outputs": [
|
613 |
+
{
|
614 |
+
"output_type": "stream",
|
615 |
+
"name": "stderr",
|
616 |
+
"text": [
|
617 |
+
"Loading Batches: 0%| | 0/3846 [00:00<?, ?it/s]"
|
618 |
+
]
|
619 |
+
},
|
620 |
+
{
|
621 |
+
"output_type": "stream",
|
622 |
+
"name": "stdout",
|
623 |
+
"text": [
|
624 |
+
"Batch loaded successfully!\n"
|
625 |
+
]
|
626 |
+
},
|
627 |
+
{
|
628 |
+
"output_type": "stream",
|
629 |
+
"name": "stderr",
|
630 |
+
"text": [
|
631 |
+
"\n"
|
632 |
+
]
|
633 |
+
}
|
634 |
+
]
|
635 |
+
},
|
636 |
+
{
|
637 |
+
"cell_type": "code",
|
638 |
+
"source": [
|
639 |
+
"print(features)\n",
|
640 |
+
"print(labels)"
|
641 |
+
],
|
642 |
+
"metadata": {
|
643 |
+
"colab": {
|
644 |
+
"base_uri": "https://localhost:8080/"
|
645 |
+
},
|
646 |
+
"id": "BeXmffD0bl4E",
|
647 |
+
"outputId": "072ef76d-d277-4357-c108-b7529cc5cc95"
|
648 |
+
},
|
649 |
+
"id": "BeXmffD0bl4E",
|
650 |
+
"execution_count": null,
|
651 |
+
"outputs": [
|
652 |
+
{
|
653 |
+
"output_type": "stream",
|
654 |
+
"name": "stdout",
|
655 |
+
"text": [
|
656 |
+
"tensor([[ 77, 616, 16, ..., 1, 1, 1],\n",
|
657 |
+
" [ 4, 1, 1, ..., 1, 1, 1],\n",
|
658 |
+
" [2998, 378, 67, ..., 1, 1, 1],\n",
|
659 |
+
" ...,\n",
|
660 |
+
" [ 4, 1, 1, ..., 1, 1, 1],\n",
|
661 |
+
" [ 4, 1, 1, ..., 1, 1, 1],\n",
|
662 |
+
" [ 168, 8, 179, ..., 1, 1, 1]])\n",
|
663 |
+
"tensor([[ 2, 77, 50, ..., 1, 1, 1],\n",
|
664 |
+
" [ 2, 173, 18, ..., 1, 1, 1],\n",
|
665 |
+
" [ 2, 67, 87, ..., 1, 1, 1],\n",
|
666 |
+
" ...,\n",
|
667 |
+
" [ 2, 328, 3, ..., 1, 1, 1],\n",
|
668 |
+
" [ 2, 328, 3, ..., 1, 1, 1],\n",
|
669 |
+
" [ 2, 108, 179, ..., 1, 1, 1]])\n"
|
670 |
+
]
|
671 |
+
}
|
672 |
+
]
|
673 |
+
},
|
674 |
+
{
|
675 |
+
"cell_type": "code",
|
676 |
+
"source": [
|
677 |
+
"import torch\n",
|
678 |
+
"import torch.nn as nn\n",
|
679 |
+
"import torch.optim as optim\n",
|
680 |
+
"import math\n",
|
681 |
+
"\n",
|
682 |
+
"# Transformer Hyperparameters\n",
|
683 |
+
"class Config:\n",
|
684 |
+
" vocab_size = 12006 # Adjust based on vocabulary.json\n",
|
685 |
+
" max_length = 100 # Adjust based on sequence length\n",
|
686 |
+
" embed_dim = 256\n",
|
687 |
+
" num_heads = 8\n",
|
688 |
+
" num_layers =2\n",
|
689 |
+
" feedforward_dim = 512\n",
|
690 |
+
" dropout = 0.1\n",
|
691 |
+
" device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
692 |
+
"\n",
|
693 |
+
"config = Config()\n",
|
694 |
+
"\n",
|
695 |
+
"# Positional Encoding\n",
|
696 |
+
"class PositionalEncoding(nn.Module):\n",
|
697 |
+
" def __init__(self, embed_dim, max_len=100):\n",
|
698 |
+
" super(PositionalEncoding, self).__init__()\n",
|
699 |
+
" pe = torch.zeros(max_len, embed_dim)\n",
|
700 |
+
" position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n",
|
701 |
+
" div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))\n",
|
702 |
+
" pe[:, 0::2] = torch.sin(position * div_term)\n",
|
703 |
+
" pe[:, 1::2] = torch.cos(position * div_term)\n",
|
704 |
+
" self.pe = pe.unsqueeze(0) # Shape: (1, max_len, embed_dim)\n",
|
705 |
+
"\n",
|
706 |
+
" def forward(self, x):\n",
|
707 |
+
" return x + self.pe[:, :x.size(1)].to(x.device)\n",
|
708 |
+
"\n",
|
709 |
+
"# Transformer Model\n",
|
710 |
+
"class PseudoCodeTransformer(nn.Module):\n",
|
711 |
+
" def __init__(self, config):\n",
|
712 |
+
" super(PseudoCodeTransformer, self).__init__()\n",
|
713 |
+
" self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)\n",
|
714 |
+
" self.positional_encoding = PositionalEncoding(config.embed_dim, config.max_length)\n",
|
715 |
+
"\n",
|
716 |
+
" self.transformer = nn.Transformer(\n",
|
717 |
+
" d_model=config.embed_dim,\n",
|
718 |
+
" nhead=config.num_heads,\n",
|
719 |
+
" num_encoder_layers=config.num_layers,\n",
|
720 |
+
" num_decoder_layers=config.num_layers,\n",
|
721 |
+
" dim_feedforward=config.feedforward_dim,\n",
|
722 |
+
" dropout=config.dropout\n",
|
723 |
+
" )\n",
|
724 |
+
"\n",
|
725 |
+
" self.fc_out = nn.Linear(config.embed_dim, config.vocab_size)\n",
|
726 |
+
" self.dropout = nn.Dropout(config.dropout)\n",
|
727 |
+
"\n",
|
728 |
+
" def generate_square_subsequent_mask(self, sz):\n",
|
729 |
+
" return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(config.device)\n",
|
730 |
+
"\n",
|
731 |
+
" def forward(self, src, tgt):\n",
|
732 |
+
" src_emb = self.embedding(src) * math.sqrt(config.embed_dim)\n",
|
733 |
+
" tgt_emb = self.embedding(tgt) * math.sqrt(config.embed_dim)\n",
|
734 |
+
"\n",
|
735 |
+
" src_emb = self.positional_encoding(src_emb)\n",
|
736 |
+
" tgt_emb = self.positional_encoding(tgt_emb)\n",
|
737 |
+
"\n",
|
738 |
+
" src_mask = self.generate_square_subsequent_mask(src.size(1))\n",
|
739 |
+
" tgt_mask = self.generate_square_subsequent_mask(tgt.size(1))\n",
|
740 |
+
"\n",
|
741 |
+
" out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),\n",
|
742 |
+
" src_mask=src_mask, tgt_mask=tgt_mask)\n",
|
743 |
+
"\n",
|
744 |
+
" out = self.fc_out(out.permute(1, 0, 2)) # Convert back to batch-first\n",
|
745 |
+
" return out\n",
|
746 |
+
"\n",
|
747 |
+
"# Initialize Model\n",
|
748 |
+
"model = PseudoCodeTransformer(config).to(config.device)\n",
|
749 |
+
"print(\"Model initialized successfully!\")\n"
|
750 |
+
],
|
751 |
+
"metadata": {
|
752 |
+
"colab": {
|
753 |
+
"base_uri": "https://localhost:8080/"
|
754 |
+
},
|
755 |
+
"id": "azPgilarcWXf",
|
756 |
+
"outputId": "143ee579-1fbf-4ff8-b54f-e4b2116245e3"
|
757 |
+
},
|
758 |
+
"id": "azPgilarcWXf",
|
759 |
+
"execution_count": 2,
|
760 |
+
"outputs": [
|
761 |
+
{
|
762 |
+
"output_type": "stream",
|
763 |
+
"name": "stderr",
|
764 |
+
"text": [
|
765 |
+
"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n",
|
766 |
+
" warnings.warn(\n"
|
767 |
+
]
|
768 |
+
},
|
769 |
+
{
|
770 |
+
"output_type": "stream",
|
771 |
+
"name": "stdout",
|
772 |
+
"text": [
|
773 |
+
"Model initialized successfully!\n"
|
774 |
+
]
|
775 |
+
}
|
776 |
+
]
|
777 |
+
},
|
778 |
+
{
|
779 |
+
"cell_type": "code",
|
780 |
+
"source": [
|
781 |
+
"def translate(model, pseudocode_tokens, vocab, device, max_length=50):\n",
|
782 |
+
" model.eval()\n",
|
783 |
+
"\n",
|
784 |
+
" # Convert pseudocode tokens to numerical indices\n",
|
785 |
+
" input_ids = [vocab.get(token, vocab[\"<unk>\"]) for token in pseudocode_tokens]\n",
|
786 |
+
" input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device) # Add batch dimension\n",
|
787 |
+
"\n",
|
788 |
+
" # Start token for generation\n",
|
789 |
+
" output_ids = [vocab[\"<start>\"]]\n",
|
790 |
+
"\n",
|
791 |
+
" for _ in range(max_length):\n",
|
792 |
+
" output_tensor = torch.tensor(output_ids, dtype=torch.long).unsqueeze(0).to(device)\n",
|
793 |
+
"\n",
|
794 |
+
" # Get model predictions\n",
|
795 |
+
" with torch.no_grad():\n",
|
796 |
+
" predictions = model(input_tensor, output_tensor)\n",
|
797 |
+
"\n",
|
798 |
+
" # Select the most probable token\n",
|
799 |
+
" next_token_id = predictions.argmax(dim=-1)[:, -1].item()\n",
|
800 |
+
" output_ids.append(next_token_id)\n",
|
801 |
+
"\n",
|
802 |
+
" # Stop if end token is generated\n",
|
803 |
+
" if next_token_id == vocab[\"<end>\"]:\n",
|
804 |
+
" break\n",
|
805 |
+
"\n",
|
806 |
+
" # Convert token indices back to words\n",
|
807 |
+
" id_to_token = {idx: token for token, idx in vocab.items()}\n",
|
808 |
+
" translated_code = [id_to_token.get(idx, \"<unk>\") for idx in output_ids[1:]] # Exclude <start> token\n",
|
809 |
+
"\n",
|
810 |
+
" return \" \".join(translated_code)\n"
|
811 |
+
],
|
812 |
+
"metadata": {
|
813 |
+
"id": "2XsYwb5jLxAT"
|
814 |
+
},
|
815 |
+
"id": "2XsYwb5jLxAT",
|
816 |
+
"execution_count": 5,
|
817 |
+
"outputs": []
|
818 |
+
},
|
819 |
+
{
|
820 |
+
"cell_type": "code",
|
821 |
+
"source": [
|
822 |
+
"import json\n",
|
823 |
+
"\n",
|
824 |
+
"# Load vocabulary\n",
|
825 |
+
"with open(\"vocabulary.json\", \"r\") as f:\n",
|
826 |
+
" vocab = json.load(f)\n",
|
827 |
+
"\n",
|
828 |
+
"# Ensure vocab is a dictionary\n",
|
829 |
+
"print(f\"β
Vocabulary loaded with {len(vocab)} tokens\")\n"
|
830 |
+
],
|
831 |
+
"metadata": {
|
832 |
+
"colab": {
|
833 |
+
"base_uri": "https://localhost:8080/"
|
834 |
+
},
|
835 |
+
"id": "OJBeh_zNL6ZM",
|
836 |
+
"outputId": "44bee4c9-e9ea-4c09-fa91-a25e4efb3e7d"
|
837 |
+
},
|
838 |
+
"id": "OJBeh_zNL6ZM",
|
839 |
+
"execution_count": 7,
|
840 |
+
"outputs": [
|
841 |
+
{
|
842 |
+
"output_type": "stream",
|
843 |
+
"name": "stdout",
|
844 |
+
"text": [
|
845 |
+
"β
Vocabulary loaded with 12006 tokens\n"
|
846 |
+
]
|
847 |
+
}
|
848 |
+
]
|
849 |
+
},
|
850 |
+
{
|
851 |
+
"cell_type": "code",
|
852 |
+
"source": [
|
853 |
+
"from torch.utils.data import DataLoader\n",
|
854 |
+
"import torch.nn.functional as F\n",
|
855 |
+
"from tqdm import tqdm\n",
|
856 |
+
"import os\n",
|
857 |
+
"\n",
|
858 |
+
"# Check for GPU availability\n",
|
859 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
860 |
+
"print(f\"πΉ Using device: {device}\")\n",
|
861 |
+
"\n",
|
862 |
+
"# Move model to device\n",
|
863 |
+
"model.to(device)\n",
|
864 |
+
"\n",
|
865 |
+
"# Loss Function & Optimizer\n",
|
866 |
+
"criterion = nn.CrossEntropyLoss(ignore_index=1) # Ignore padding token\n",
|
867 |
+
"optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-3)\n",
|
868 |
+
"\n",
|
869 |
+
"# Create directory to save models\n",
|
870 |
+
"os.makedirs(\"checkpoints\", exist_ok=True)\n",
|
871 |
+
"\n",
|
872 |
+
"# Training Loop\n",
|
873 |
+
"num_epochs = 1\n",
|
874 |
+
"for epoch in range(num_epochs):\n",
|
875 |
+
" model.train()\n",
|
876 |
+
" epoch_loss = 0\n",
|
877 |
+
"\n",
|
878 |
+
" progress_bar = tqdm(dataloader, desc=f\"Epoch {epoch+1}/{num_epochs}\")\n",
|
879 |
+
" for batch in progress_bar:\n",
|
880 |
+
" src, tgt = batch\n",
|
881 |
+
" src, tgt = src.to(device), tgt.to(device) # Move batch to GPU\n",
|
882 |
+
"\n",
|
883 |
+
" tgt_input = tgt[:, :-1] # Remove <end> token\n",
|
884 |
+
" tgt_output = tgt[:, 1:] # Shifted version\n",
|
885 |
+
"\n",
|
886 |
+
" optimizer.zero_grad()\n",
|
887 |
+
" output = model(src, tgt_input)\n",
|
888 |
+
"\n",
|
889 |
+
" loss = criterion(output.view(-1, config.vocab_size), tgt_output.contiguous().view(-1))\n",
|
890 |
+
" loss.backward()\n",
|
891 |
+
" optimizer.step()\n",
|
892 |
+
"\n",
|
893 |
+
" epoch_loss += loss.item()\n",
|
894 |
+
" progress_bar.set_postfix(loss=loss.item())\n",
|
895 |
+
"\n",
|
896 |
+
" avg_loss = epoch_loss / len(dataloader)\n",
|
897 |
+
" print(f\"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}\")\n",
|
898 |
+
"\n",
|
899 |
+
" # Save Model Checkpoint\n",
|
900 |
+
" torch.save(model.state_dict(), f\"checkpoints/transformer_epoch_{epoch+1}.pth\")\n",
|
901 |
+
" print(f\"β
Model saved: checkpoints/transformer_epoch_{epoch+1}.pth\")\n",
|
902 |
+
"\n",
|
903 |
+
" # Print Example Prediction\n",
|
904 |
+
" model.eval()\n",
|
905 |
+
" example_pseudocode = [\"create\", \"integer\", \"x\"]\n",
|
906 |
+
" translated_code = translate(model, example_pseudocode, vocab, device)\n",
|
907 |
+
" print(f\"πΉ Example Prediction (Pseudocode β C++): {translated_code}\\n\")\n"
|
908 |
+
],
|
909 |
+
"metadata": {
|
910 |
+
"id": "SUOIS04idMXB",
|
911 |
+
"colab": {
|
912 |
+
"base_uri": "https://localhost:8080/"
|
913 |
+
},
|
914 |
+
"outputId": "9134b609-78d4-4f07-bd4a-7057ba9b5adf"
|
915 |
+
},
|
916 |
+
"id": "SUOIS04idMXB",
|
917 |
+
"execution_count": null,
|
918 |
+
"outputs": [
|
919 |
+
{
|
920 |
+
"output_type": "stream",
|
921 |
+
"name": "stdout",
|
922 |
+
"text": [
|
923 |
+
"πΉ Using device: cuda\n"
|
924 |
+
]
|
925 |
+
},
|
926 |
+
{
|
927 |
+
"output_type": "stream",
|
928 |
+
"name": "stderr",
|
929 |
+
"text": [
|
930 |
+
"Epoch 1/1: 100%|ββββββββββ| 3846/3846 [06:02<00:00, 10.62it/s, loss=0.861]\n"
|
931 |
+
]
|
932 |
+
},
|
933 |
+
{
|
934 |
+
"output_type": "stream",
|
935 |
+
"name": "stdout",
|
936 |
+
"text": [
|
937 |
+
"Epoch [1/1], Loss: 0.9374\n",
|
938 |
+
"β
Model saved: checkpoints/transformer_epoch_1.pth\n",
|
939 |
+
"πΉ Example Prediction (Pseudocode β C++): int x int x , int x , int x , int x , int x , int x , int x , int x , int x , int x , int x , int x , int x ; <end>\n",
|
940 |
+
"\n"
|
941 |
+
]
|
942 |
+
}
|
943 |
+
]
|
944 |
+
},
|
945 |
+
{
|
946 |
+
"cell_type": "code",
|
947 |
+
"source": [
|
948 |
+
"example_pseudocode = [\"for\", \"i\", \"=\", \"0\", \"to\", \"size\", \"of\", \"ans\", \"exclusive\", \",\", \"print\", \"ans\", \"[\", \"i\", \"]\", \"print\", \"newline\"]\n",
|
949 |
+
"translated_code = translate(model, example_pseudocode, vocab, device)\n",
|
950 |
+
"print(f\"πΉ Example Prediction (Pseudocode β C++): {translated_code}\\n\")"
|
951 |
+
],
|
952 |
+
"metadata": {
|
953 |
+
"colab": {
|
954 |
+
"base_uri": "https://localhost:8080/"
|
955 |
+
},
|
956 |
+
"id": "D5pxkdcUL5Gd",
|
957 |
+
"outputId": "c479d4a4-cfde-48cb-d0e7-22c74db2a500"
|
958 |
+
},
|
959 |
+
"id": "D5pxkdcUL5Gd",
|
960 |
+
"execution_count": null,
|
961 |
+
"outputs": [
|
962 |
+
{
|
963 |
+
"output_type": "stream",
|
964 |
+
"name": "stdout",
|
965 |
+
"text": [
|
966 |
+
"πΉ Example Prediction (Pseudocode β C++): for ( int i = 0 ; i < ( ans ) ; i++ ) { cout < < < < ( ans [ i ] .size ( ans [ i ] .size ( ans [ i ] .size ( ans [ i ] , 0 ] ) ] .size\n",
|
967 |
+
"\n"
|
968 |
+
]
|
969 |
+
}
|
970 |
+
]
|
971 |
+
},
|
972 |
+
{
|
973 |
+
"cell_type": "code",
|
974 |
+
"source": [
|
975 |
+
"for (int i = 0; i < ans.size(); i++) { cout << ans[i] << endl; }"
|
976 |
+
],
|
977 |
+
"metadata": {
|
978 |
+
"colab": {
|
979 |
+
"base_uri": "https://localhost:8080/"
|
980 |
+
},
|
981 |
+
"id": "hh7c0AziPqG5",
|
982 |
+
"outputId": "4f44229a-5778-4a2c-e368-680f387a6589"
|
983 |
+
},
|
984 |
+
"id": "hh7c0AziPqG5",
|
985 |
+
"execution_count": null,
|
986 |
+
"outputs": [
|
987 |
+
{
|
988 |
+
"output_type": "stream",
|
989 |
+
"name": "stdout",
|
990 |
+
"text": [
|
991 |
+
"πΉ Using device: cuda\n"
|
992 |
+
]
|
993 |
+
},
|
994 |
+
{
|
995 |
+
"output_type": "execute_result",
|
996 |
+
"data": {
|
997 |
+
"text/plain": [
|
998 |
+
"PseudoCodeTransformer(\n",
|
999 |
+
" (embedding): Embedding(12006, 256)\n",
|
1000 |
+
" (positional_encoding): PositionalEncoding()\n",
|
1001 |
+
" (transformer): Transformer(\n",
|
1002 |
+
" (encoder): TransformerEncoder(\n",
|
1003 |
+
" (layers): ModuleList(\n",
|
1004 |
+
" (0-1): 2 x TransformerEncoderLayer(\n",
|
1005 |
+
" (self_attn): MultiheadAttention(\n",
|
1006 |
+
" (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)\n",
|
1007 |
+
" )\n",
|
1008 |
+
" (linear1): Linear(in_features=256, out_features=512, bias=True)\n",
|
1009 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
1010 |
+
" (linear2): Linear(in_features=512, out_features=256, bias=True)\n",
|
1011 |
+
" (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
|
1012 |
+
" (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
|
1013 |
+
" (dropout1): Dropout(p=0.1, inplace=False)\n",
|
1014 |
+
" (dropout2): Dropout(p=0.1, inplace=False)\n",
|
1015 |
+
" )\n",
|
1016 |
+
" )\n",
|
1017 |
+
" (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
|
1018 |
+
" )\n",
|
1019 |
+
" (decoder): TransformerDecoder(\n",
|
1020 |
+
" (layers): ModuleList(\n",
|
1021 |
+
" (0-1): 2 x TransformerDecoderLayer(\n",
|
1022 |
+
" (self_attn): MultiheadAttention(\n",
|
1023 |
+
" (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)\n",
|
1024 |
+
" )\n",
|
1025 |
+
" (multihead_attn): MultiheadAttention(\n",
|
1026 |
+
" (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)\n",
|
1027 |
+
" )\n",
|
1028 |
+
" (linear1): Linear(in_features=256, out_features=512, bias=True)\n",
|
1029 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
1030 |
+
" (linear2): Linear(in_features=512, out_features=256, bias=True)\n",
|
1031 |
+
" (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
|
1032 |
+
" (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
|
1033 |
+
" (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
|
1034 |
+
" (dropout1): Dropout(p=0.1, inplace=False)\n",
|
1035 |
+
" (dropout2): Dropout(p=0.1, inplace=False)\n",
|
1036 |
+
" (dropout3): Dropout(p=0.1, inplace=False)\n",
|
1037 |
+
" )\n",
|
1038 |
+
" )\n",
|
1039 |
+
" (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)\n",
|
1040 |
+
" )\n",
|
1041 |
+
" )\n",
|
1042 |
+
" (fc_out): Linear(in_features=256, out_features=12006, bias=True)\n",
|
1043 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
1044 |
+
")"
|
1045 |
+
]
|
1046 |
+
},
|
1047 |
+
"metadata": {},
|
1048 |
+
"execution_count": 8
|
1049 |
+
}
|
1050 |
+
]
|
1051 |
+
},
+{
+"cell_type": "code",
+"source": [
+"# Load the trained model\n",
+"\n",
+"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+"print(f\"🔹 Using device: {device}\")\n",
+"\n",
+"# Instantiate the model on the target device and load the checkpoint\n",
+"model = PseudoCodeTransformer(config).to(device)\n",
+"model.load_state_dict(torch.load(\"/content/transformer_epoch_1.pth\", map_location=device))\n",
+"model.eval()\n",
+"\n",
+"# Run translation on example pseudocode\n",
+"example_pseudocode = [\"for\", \"i\", \"=\", \"0\", \"to\", \"size\", \"of\", \"ans\", \"exclusive\", \",\", \"print\", \"ans\", \"[\", \"i\", \"]\", \"print\", \"newline\"]\n",
+"translated_code = translate(model, example_pseudocode, vocab, device)\n",
+"\n",
+"print(f\"🔹 Example Prediction (Pseudocode → C++): {translated_code}\\n\")\n"
+],
+"metadata": {
+"id": "t-xzQokaPy_E",
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"outputId": "9bdff1cd-1094-4ba8-df9f-9cbde4a628bd"
+},
+"id": "t-xzQokaPy_E",
+"execution_count": 8,
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"🔹 Using device: cuda\n"
+]
+},
+{
+"output_type": "stream",
+"name": "stderr",
+"text": [
+"<ipython-input-8-057fb80d4514>:10: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+"  model.load_state_dict(torch.load(\"/content/transformer_epoch_1.pth\", map_location=device))\n"
+]
+},
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"🔹 Example Prediction (Pseudocode → C++): for ( int i = 0 ; i < ( ans ) ; i++ ) { cout < < < < ( ans [ i ] .size ( ans [ i ] .size ( ans [ i ] .size ( ans [ i ] , 0 ] ) ] .size\n",
+"\n"
+]
+}
+]
+},
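The FutureWarning in the stderr output above comes from calling torch.load with the default weights_only=False. Since only a plain state_dict is restored here, the fix the warning itself recommends applies directly (a minimal sketch, assuming the same checkpoint path):

# Opt in to weights-only unpickling: safer against malicious pickles, and
# silences the FutureWarning.
state = torch.load("/content/transformer_epoch_1.pth", map_location=device, weights_only=True)
model.load_state_dict(state)

The degenerate prediction ("cout < < < < ...") is what greedy decoding from a model trained for a single epoch tends to produce; it is not a loading failure.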
+{
+"cell_type": "markdown",
+"source": [
+"**C++ CODE TO PSEUDOCODE**"
+],
+"metadata": {
+"id": "CrE_8fkdRdfQ"
+},
+"id": "CrE_8fkdRdfQ"
+},
+{
+"cell_type": "code",
+"source": [
+"import torch\n",
+"import torch.nn as nn\n",
+"import torch.optim as optim\n",
+"import math\n",
+"import pandas as pd\n",
+"import ast\n",
+"import json\n",
+"from torch.utils.data import DataLoader, Dataset\n",
+"from torch.nn.utils.rnn import pad_sequence\n",
+"from tqdm import tqdm\n",
+"import os\n",
+"\n",
+"# Transformer Hyperparameters\n",
+"class Config:\n",
+"    vocab_size = 12006  # Adjust based on vocabulary.json\n",
+"    max_length = 100\n",
+"    embed_dim = 256\n",
+"    num_heads = 8\n",
+"    num_layers = 2\n",
+"    feedforward_dim = 512\n",
+"    dropout = 0.1\n",
+"    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+"\n",
+"config = Config()\n",
+"\n",
+"# Positional Encoding\n",
+"class PositionalEncoding(nn.Module):\n",
+"    def __init__(self, embed_dim, max_len=100):\n",
+"        super(PositionalEncoding, self).__init__()\n",
+"        pe = torch.zeros(max_len, embed_dim)\n",
+"        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n",
+"        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))\n",
+"        pe[:, 0::2] = torch.sin(position * div_term)\n",
+"        pe[:, 1::2] = torch.cos(position * div_term)\n",
+"        self.pe = pe.unsqueeze(0)\n",
+"\n",
+"    def forward(self, x):\n",
+"        return x + self.pe[:, :x.size(1)].to(x.device)\n",
+"\n",
+"# Transformer Model\n",
+"class CPPtoPseudoTransformer(nn.Module):\n",
+"    def __init__(self, config):\n",
+"        super(CPPtoPseudoTransformer, self).__init__()\n",
+"        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)\n",
+"        self.positional_encoding = PositionalEncoding(config.embed_dim, config.max_length)\n",
+"\n",
+"        self.transformer = nn.Transformer(\n",
+"            d_model=config.embed_dim,\n",
+"            nhead=config.num_heads,\n",
+"            num_encoder_layers=config.num_layers,\n",
+"            num_decoder_layers=config.num_layers,\n",
+"            dim_feedforward=config.feedforward_dim,\n",
+"            dropout=config.dropout\n",
+"        )\n",
+"\n",
+"        self.fc_out = nn.Linear(config.embed_dim, config.vocab_size)\n",
+"        self.dropout = nn.Dropout(config.dropout)\n",
+"\n",
+"    def generate_square_subsequent_mask(self, sz):\n",
+"        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(config.device)\n",
+"\n",
+"    def forward(self, src, tgt):\n",
+"        src_emb = self.embedding(src) * math.sqrt(config.embed_dim)\n",
+"        tgt_emb = self.embedding(tgt) * math.sqrt(config.embed_dim)\n",
+"\n",
+"        src_emb = self.positional_encoding(src_emb)\n",
+"        tgt_emb = self.positional_encoding(tgt_emb)\n",
+"\n",
+"        src_mask = self.generate_square_subsequent_mask(src.size(1))\n",
+"        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1))\n",
+"\n",
+"        out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),\n",
+"                               src_mask=src_mask, tgt_mask=tgt_mask)\n",
+"\n",
+"        out = self.fc_out(out.permute(1, 0, 2))\n",
+"        return out\n",
+"\n",
+"# Initialize Model\n",
+"model = CPPtoPseudoTransformer(config).to(config.device)\n",
+"print(\"🚀 C++ → Pseudocode Model initialized!\")\n"
+],
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"id": "yE-UUuDGI5Az",
+"outputId": "8513b586-2c83-4a62-f86a-5bb00f3247a9"
+},
+"id": "yE-UUuDGI5Az",
+"execution_count": 17,
+"outputs": [
+{
+"output_type": "stream",
+"name": "stderr",
+"text": [
+"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/transformer.py:379: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n",
+"  warnings.warn(\n"
+]
+},
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"🚀 C++ → Pseudocode Model initialized!\n"
+]
+}
+]
+},
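The UserWarning above is raised because nn.Transformer is constructed without batch_first=True, so it expects (seq, batch, embed) inputs, which is why forward permutes with permute(1, 0, 2). A sketch of the alternative construction under the same Config; with it, both permute calls become unnecessary:

# Sketch: batch_first=True makes the module accept (batch, seq, embed) directly.
transformer = nn.Transformer(
    d_model=config.embed_dim,
    nhead=config.num_heads,
    num_encoder_layers=config.num_layers,
    num_decoder_layers=config.num_layers,
    dim_feedforward=config.feedforward_dim,
    dropout=config.dropout,
    batch_first=True,
)

Note also that forward applies a causal mask to the encoder via src_mask; in a standard encoder-decoder setup only the decoder needs the subsequent mask, so passing src_mask here restricts each source token to attending only to earlier source tokens.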
+{
+"cell_type": "code",
+"source": [
+"\n",
+"# Load Vocabulary\n",
+"with open(\"vocabulary.json\", \"r\") as f:\n",
+"    vocab = json.load(f)\n",
+"\n",
+"print(f\"✅ Vocabulary loaded with {len(vocab)} tokens\")\n",
+"\n"
+],
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"id": "CdWsUGr4KHM1",
+"outputId": "b88217be-45e8-4c85-eed9-679b54627848"
+},
+"id": "CdWsUGr4KHM1",
+"execution_count": 18,
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"✅ Vocabulary loaded with 12006 tokens\n"
+]
+}
+]
+},
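The helpers below assume vocabulary.json maps token strings to integer ids and includes the four special tokens they rely on. A quick assertion (a sketch, not in the committed notebook) makes that dependency explicit:

# Sketch: fail fast if the vocabulary is missing a required special token.
for tok in ("<pad>", "<unk>", "<start>", "<end>"):
    assert tok in vocab, f"vocabulary.json is missing special token {tok}"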
+{
+"cell_type": "code",
+"source": [
+"def translate2(model, cpp_tokens, vocab, device, max_length=50):\n",
+"    model.eval()\n",
+"\n",
+"    # Convert C++ tokens to numerical indices\n",
+"    input_ids = [vocab.get(token, vocab[\"<unk>\"]) for token in cpp_tokens]\n",
+"    input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension\n",
+"\n",
+"    output_ids = [vocab[\"<start>\"]]\n",
+"\n",
+"    for _ in range(max_length):\n",
+"        output_tensor = torch.tensor(output_ids, dtype=torch.long).unsqueeze(0).to(device)\n",
+"\n",
+"        # Get model predictions\n",
+"        with torch.no_grad():\n",
+"            predictions = model(input_tensor, output_tensor)\n",
+"\n",
+"        # Select the most probable token\n",
+"        next_token_id = predictions.argmax(dim=-1)[:, -1].item()\n",
+"\n",
+"        if next_token_id == vocab[\"<pad>\"]:  # Ignore <pad> tokens\n",
+"            continue\n",
+"\n",
+"        output_ids.append(next_token_id)\n",
+"\n",
+"        if next_token_id == vocab[\"<end>\"]:  # Stop if <end> is generated\n",
+"            break\n",
+"\n",
+"    # Convert token indices back to words\n",
+"    id_to_token = {idx: token for token, idx in vocab.items()}\n",
+"    translated_pseudocode = [id_to_token.get(idx, \"<unk>\") for idx in output_ids[1:]]  # Exclude <start>\n",
+"\n",
+"    return \" \".join(translated_pseudocode)\n"
+],
+"metadata": {
+"id": "BEEQ_zNHKQRY"
+},
+"id": "BEEQ_zNHKQRY",
+"execution_count": 35,
+"outputs": []
+},
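A usage sketch for translate2 (the token list is hypothetical; inputs must be tokenized the same way as the training data):

example = ["int", "x", "=", "5", ";"]  # hypothetical pre-tokenized C++ snippet
print(translate2(model, example, vocab, config.device))

Since id_to_token is rebuilt inside every call, hoisting the reverse map out of the function (id_to_token = {idx: tok for tok, idx in vocab.items()}) and reusing it would avoid an O(|vocab|) dictionary build per translation.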
+{
+"cell_type": "code",
+"source": [
+"# Dataset Class\n",
+"class CPPToPseudoDataset(Dataset):\n",
+"    def __init__(self, file_path):\n",
+"        df = pd.read_csv(file_path)\n",
+"        self.inputs = [ast.literal_eval(x) for x in df['code_sequences']]\n",
+"        self.outputs = [ast.literal_eval(x) for x in df['text_sequences']]\n",
+"\n",
+"    def __len__(self):\n",
+"        return len(self.inputs)\n",
+"\n",
+"    def __getitem__(self, idx):\n",
+"        input_tensor = torch.tensor(self.inputs[idx], dtype=torch.int64)\n",
+"        output_tensor = torch.tensor([vocab[\"<start>\"]] + self.outputs[idx] + [vocab[\"<end>\"]], dtype=torch.int64)\n",
+"        return input_tensor, output_tensor\n",
+"\n",
+"# Padding Function\n",
+"def Add_Pad(batch):\n",
+"    inputs, outputs = zip(*batch)\n",
+"    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)\n",
+"    outputs = pad_sequence(outputs, batch_first=True, padding_value=0)\n",
+"    return inputs, outputs\n",
+"\n",
+"# Load Dataset\n",
+"dataset = CPPToPseudoDataset(\"tokenized_sequences.csv\")\n",
+"dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=Add_Pad)\n",
+"\n",
+"print(f\"✅ Loaded {len(dataset)} examples for training\")\n",
+"\n",
+"# Training Configuration\n",
+"criterion = nn.CrossEntropyLoss(ignore_index=vocab[\"<pad>\"])\n",
+"optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-3)\n",
+"\n",
+"# Create directory to save models\n",
+"os.makedirs(\"checkpoints\", exist_ok=True)\n"
+],
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"id": "t3HIGFZCKNaB",
+"outputId": "f9ac82d1-ff8a-45bc-aa06-9e78738a4021"
+},
+"id": "t3HIGFZCKNaB",
+"execution_count": 34,
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"✅ Loaded 246086 examples for training\n"
+]
+}
+]
+},
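A toy sketch of what Add_Pad produces: both sides are right-padded with 0, which must coincide with vocab["<pad>"] so that CrossEntropyLoss(ignore_index=vocab["<pad>"]) really skips padded positions:

# Sketch: a ragged two-example batch, padded to a rectangular tensor pair.
batch = [
    (torch.tensor([5, 6]),       torch.tensor([1, 7, 2])),
    (torch.tensor([5, 6, 7, 8]), torch.tensor([1, 9, 2])),
]
src, tgt = Add_Pad(batch)
print(src.shape, tgt.shape)  # torch.Size([2, 4]) torch.Size([2, 3])
print(src[0])                # tensor([5, 6, 0, 0]): zero-padded on the right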
+{
+"cell_type": "code",
+"source": [
+"# Training Loop\n",
+"num_epochs = 1\n",
+"for epoch in range(num_epochs):\n",
+"    model.train()\n",
+"    epoch_loss = 0\n",
+"\n",
+"    progress_bar = tqdm(dataloader, desc=f\"Epoch {epoch+1}/{num_epochs}\")\n",
+"    for batch in progress_bar:\n",
+"        src, tgt = batch\n",
+"        src, tgt = src.to(config.device), tgt.to(config.device)\n",
+"\n",
+"        tgt_input = tgt[:, :-1]\n",
+"        tgt_output = tgt[:, 1:]\n",
+"\n",
+"        optimizer.zero_grad()\n",
+"        output = model(src, tgt_input)\n",
+"\n",
+"        loss = criterion(output.view(-1, config.vocab_size), tgt_output.contiguous().view(-1))\n",
+"        loss.backward()\n",
+"        optimizer.step()\n",
+"\n",
+"        epoch_loss += loss.item()\n",
+"        progress_bar.set_postfix(loss=loss.item())\n",
+"\n",
+"    avg_loss = epoch_loss / len(dataloader)\n",
+"    print(f\"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}\")\n",
+"\n",
+"    # Save Model Checkpoint\n",
+"    torch.save(model.state_dict(), f\"checkpoints/cpp_to_pseudo_epoch_{epoch+1}.pth\")\n",
+"    print(f\"✅ Model saved: checkpoints/cpp_to_pseudo_epoch_{epoch+1}.pth\")\n",
+"\n",
+"    # Print Example Prediction\n",
+"    model.eval()\n",
+"    example_cpp = [\"int\", \"main\", \"(\", \")\", \"{\", \"return\", \"0\", \";\", \"}\"]\n",
+"    translated_pseudocode = translate2(model, example_cpp, vocab, config.device)\n",
+"    print(f\"🔹 Example Prediction (C++ → Pseudocode): {translated_pseudocode}\\n\")\n"
+],
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"id": "QXd9vz5EKT8u",
+"outputId": "692ad75d-6c39-4f68-e577-7bc88d4ccea3"
+},
+"id": "QXd9vz5EKT8u",
+"execution_count": 36,
+"outputs": [
+{
+"output_type": "stream",
+"name": "stderr",
+"text": [
+"Epoch 1/1: 100%|██████████| 3846/3846 [06:10<00:00, 10.38it/s, loss=1.84]\n"
+]
+},
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"Epoch [1/1], Loss: 0.9463\n",
+"✅ Model saved: checkpoints/cpp_to_pseudo_epoch_1.pth\n",
+"🔹 Example Prediction (C++ → Pseudocode): nan return 0 from function return 0 to int function nan return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 return 0 from function return 0 return 0 return 0 return\n",
+"\n"
+]
+}
+]
+},
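For scale, the reported loss of 0.9463 is an average per-token cross-entropy under teacher forcing (the decoder sees tgt[:, :-1] and is scored against tgt[:, 1:]); it corresponds to a per-token perplexity of about 2.6 over the 12,006-token vocabulary, which is why the loss looks good while the free-running greedy sample above degrades quickly:

import math
print(math.exp(0.9463))  # ~2.576: per-token perplexity implied by the epoch loss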
+{
+"cell_type": "code",
+"source": [
+"# Load the trained model\n",
+"\n",
+"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+"print(f\"🔹 Using device: {device}\")\n",
+"\n",
+"# Instantiate the matching C++ → pseudocode model and load its checkpoint\n",
+"model = CPPtoPseudoTransformer(config).to(device)\n",
+"model.load_state_dict(torch.load(\"/content/checkpoints/cpp_to_pseudo_epoch_1.pth\", map_location=device))\n",
+"model.eval()\n",
+"\n",
+"example_cpp = [\"int\", \"a\", \"=\", \"10\", \";\"]\n",
+"translated_pseudocode = translate2(model, example_cpp, vocab, config.device)\n",
+"print(f\"🔹 Example Prediction (C++ → Pseudocode): {translated_pseudocode}\\n\")\n"
+],
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"id": "tfjIcOTkK33z",
+"outputId": "2e481125-7130-4e79-c213-528162a252e8"
+},
+"id": "tfjIcOTkK33z",
+"execution_count": 40,
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"🔹 Using device: cuda\n"
+]
+},
+{
+"output_type": "stream",
+"name": "stderr",
+"text": [
+"<ipython-input-40-3042f1a40ae4>:10: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+"  model.load_state_dict(torch.load(\"/content/checkpoints/cpp_to_pseudo_epoch_1.pth\", map_location=device))\n"
+]
+},
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"🔹 Example Prediction (C++ → Pseudocode): create integer a a with a = 10 a = 10 10 integer with a = 10 10 10 10 a 10 a 10 a 10 a 10 a 10 a 10 a 10 integer with a = 10 a 10 a 10 a 10 a 10 a 10 a\n",
+"\n"
+]
+}
+]
+},
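The looping output above ("a 10 a 10 ...") is typical of greedy decoding after one epoch. One minimal, hypothetical mitigation inside translate2's loop (not part of the committed notebook) is to forbid an immediate repeat of the previous token:

logits = predictions[0, -1].clone()         # next-token scores, shape (vocab_size,)
if len(output_ids) > 1:
    logits[output_ids[-1]] = float("-inf")  # block repeating the last emitted token
next_token_id = logits.argmax().item()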
+{
+"cell_type": "code",
+"source": [],
+"metadata": {
+"id": "vBmHMGCJM3Cw"
+},
+"id": "vBmHMGCJM3Cw",
+"execution_count": null,
+"outputs": []
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.11.7"
+},
+"colab": {
+"provenance": [],
+"gpuType": "T4"
+},
+"accelerator": "GPU"
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}
transformer_epoch_1.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a5266dc7e1a780031bc3f7b90ad9d5b301120d03e3dc6c8fc0dbd1ccbeeefc7
+size 40493679
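The three lines above are a Git LFS pointer rather than the weights themselves: the repository commits this small stub (spec version, SHA-256 object id, byte size), while the 40,493,679-byte transformer_epoch_1.pth binary is stored in LFS.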
vocabulary.json
ADDED
The diff for this file is too large to render. See raw diff.