arise-sustech committed
Commit 196150d
1 Parent(s): c55a7fa

Update readme

Files changed (5)
  1. README.md +62 -143
  2. config.json +2 -3
  3. generation_config.json +1 -1
  4. tokenizer.json +6 -1
  5. tokenizer_config.json +1 -6
README.md CHANGED
@@ -1,168 +1,87 @@
---
- license: other
- license_name: deepseek-license
- license_link: LICENSE
---

- <p align="center">
- <img width="1000px" alt="DeepSeek Coder" src="https://github.com/deepseek-ai/DeepSeek-Coder/blob/main/pictures/logo.png?raw=true">
- </p>
- <p align="center"><a href="https://www.deepseek.com/">[🏠Homepage]</a> | <a href="https://coder.deepseek.com/">[🤖 Chat with DeepSeek Coder]</a> | <a href="https://discord.gg/Tc7c45Zzu5">[Discord]</a> | <a href="https://github.com/guoday/assert/blob/main/QR.png?raw=true">[Wechat(微信)]</a> </p>
- <hr>

- ### 1. Introduction of Deepseek Coder

- Deepseek Coder is composed of a series of code language models, each trained from scratch on 2T tokens, with a composition of 87% code and 13% natural language in both English and Chinese. We provide various sizes of the code model, ranging from 1B to 33B versions. Each model is pre-trained on project-level code corpus by employing a window size of 16K and a extra fill-in-the-blank task, to support project-level code completion and infilling. For coding capabilities, Deepseek Coder achieves state-of-the-art performance among open-source code models on multiple programming languages and various benchmarks.
-
- - **Massive Training Data**: Trained from scratch on 2T tokens, including 87% code and 13% linguistic data in both English and Chinese languages.
-
- - **Highly Flexible & Scalable**: Offered in model sizes of 1.3B, 5.7B, 6.7B, and 33B, enabling users to choose the setup most suitable for their requirements.
-
- - **Superior Model Performance**: State-of-the-art performance among publicly available code models on HumanEval, MultiPL-E, MBPP, DS-1000, and APPS benchmarks.
-
- - **Advanced Code Completion Capabilities**: A window size of 16K and a fill-in-the-blank task, supporting project-level code completion and infilling tasks.
-
-
- ### 2. Model Summary
- deepseek-coder-1.3b-base is a 1.3B parameter model with Multi-Head Attention trained on 1 trillion tokens.
- - **Home Page:** [DeepSeek](https://deepseek.com/)
- - **Repository:** [deepseek-ai/deepseek-coder](https://github.com/deepseek-ai/deepseek-coder)
- - **Chat With DeepSeek Coder:** [DeepSeek-Coder](https://coder.deepseek.com/)

### 3. How to Use
- Here give some examples of how to use our model.
- #### 1)Code Completion
- ```python
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch
- tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True).cuda()
- input_text = "#write a quick sort algorithm"
- inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
- outputs = model.generate(**inputs, max_length=128)
- print(tokenizer.decode(outputs[0], skip_special_tokens=True))
- ```
-
- #### 2)Code Insertion
```python
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch
- tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True).cuda()
- input_text = """<|fim▁begin|>def quick_sort(arr):
-     if len(arr) <= 1:
-         return arr
-     pivot = arr[0]
-     left = []
-     right = []
- <|fim▁hole|>
-         if arr[i] < pivot:
-             left.append(arr[i])
-         else:
-             right.append(arr[i])
-     return quick_sort(left) + [pivot] + quick_sort(right)<|fim▁end|>"""
- inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
- outputs = model.generate(**inputs, max_length=128)
- print(tokenizer.decode(outputs[0], skip_special_tokens=True)[len(input_text):])
```

- #### 3)Repository Level Code Completion
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
- tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True).cuda()
-
- input_text = """#utils.py
import torch
- from sklearn import datasets
- from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import StandardScaler
- from sklearn.metrics import accuracy_score
-
- def load_data():
-     iris = datasets.load_iris()
-     X = iris.data
-     y = iris.target

-     # Standardize the data
-     scaler = StandardScaler()
-     X = scaler.fit_transform(X)

-     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
-
-     # Convert numpy data to PyTorch tensors
-     X_train = torch.tensor(X_train, dtype=torch.float32)
-     X_test = torch.tensor(X_test, dtype=torch.float32)
-     y_train = torch.tensor(y_train, dtype=torch.int64)
-     y_test = torch.tensor(y_test, dtype=torch.int64)
-
-     return X_train, X_test, y_train, y_test
-
- def evaluate_predictions(y_test, y_pred):
-     return accuracy_score(y_test, y_pred)
- #model.py
- import torch
- import torch.nn as nn
- import torch.optim as optim
- from torch.utils.data import DataLoader, TensorDataset
-
- class IrisClassifier(nn.Module):
-     def __init__(self):
-         super(IrisClassifier, self).__init__()
-         self.fc = nn.Sequential(
-             nn.Linear(4, 16),
-             nn.ReLU(),
-             nn.Linear(16, 3)
-         )
-
-     def forward(self, x):
-         return self.fc(x)
-
-     def train_model(self, X_train, y_train, epochs, lr, batch_size):
-         criterion = nn.CrossEntropyLoss()
-         optimizer = optim.Adam(self.parameters(), lr=lr)
-
-         # Create DataLoader for batches
-         dataset = TensorDataset(X_train, y_train)
-         dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
-
-         for epoch in range(epochs):
-             for batch_X, batch_y in dataloader:
-                 optimizer.zero_grad()
-                 outputs = self(batch_X)
-                 loss = criterion(outputs, batch_y)
-                 loss.backward()
-                 optimizer.step()
-
-     def predict(self, X_test):
-         with torch.no_grad():
-             outputs = self(X_test)
-             _, predicted = outputs.max(1)
-             return predicted.numpy()
- #main.py
- from utils import load_data, evaluate_predictions
- from model import IrisClassifier as Classifier
-
- def main():
-     # Model training and evaluation
- """
- inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
- outputs = model.generate(**inputs, max_new_tokens=140)
- print(tokenizer.decode(outputs[0]))
```

-
-
### 4. License
- This code repository is licensed under the MIT License. The use of DeepSeek Coder models is subject to the Model License. DeepSeek Coder supports commercial use.
-
- See the [LICENSE-MODEL](https://github.com/deepseek-ai/deepseek-coder/blob/main/LICENSE-MODEL) for more details.

### 5. Contact

- If you have any questions, please raise an issue or contact us at [agi_code@deepseek.com](mailto:agi_code@deepseek.com).
---
+ license: mit
+ tags:
+ - decompile
+ - binary
---

+ ### 1. Introduction of LLM4Decompile

+ LLM4Decompile aims to decompile x86 assembly instructions into C. It is fine-tuned from DeepSeek-Coder on 4B tokens of assembly-C pairs compiled from AnghaBench.

+ - **GitHub Repository:** [LLM4Decompile](https://github.com/albertan017/LLM4Decompile)

+ ### 2. Evaluation Results
+ | Model | Re-compilability | | | | | Re-executability | | | | |
+ |--------------------|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|
+ | Optimization level | O0 | O1 | O2 | O3 | Avg. | O0 | O1 | O2 | O3 | Avg. |
+ | GPT4 | 0.92 | 0.94 | 0.88 | 0.84 | 0.895 | 0.1341 | 0.1890 | 0.1524 | 0.0854 | 0.1402 |
+ | DeepSeek-Coder-33B | 0.0659 | 0.0866 | 0.1500 | 0.1463 | 0.1122 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 |
+ | LLM4Decompile-1b | 0.8780 | 0.8732 | 0.8683 | 0.8378 | 0.8643 | 0.1573 | 0.0768 | 0.1000 | 0.0878 | 0.1055 |
+ | LLM4Decompile-6b | 0.8817 | 0.8951 | 0.8671 | 0.8476 | 0.8729 | 0.3000 | 0.1732 | 0.1988 | 0.1841 | 0.2140 |
+ | LLM4Decompile-33b | 0.8134 | 0.8195 | 0.8183 | 0.8305 | 0.8204 | 0.3049 | 0.1902 | 0.1817 | 0.1817 | 0.2146 |
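Here, re-compilability is the fraction of decompiled functions that GCC can compile again, and re-executability the fraction whose recompiled binaries still behave like the original under test. As a rough illustration only (this is not the authors' evaluation harness; the file names and compiler flags below are placeholder assumptions), a re-compilability check can be sketched like this:

```python
import subprocess

def recompiles(c_source_path: str) -> bool:
    """Return True if GCC accepts the decompiled C file (illustrative check only)."""
    result = subprocess.run(
        ["gcc", "-c", c_source_path, "-o", "/tmp/recompiled.o"],
        capture_output=True,
    )
    return result.returncode == 0

# Hypothetical usage: fraction of decompiled outputs that compile cleanly.
# outputs = ["func_0.decompiled.c", "func_1.decompiled.c"]
# rate = sum(recompiles(path) for path in outputs) / len(outputs)
```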

### 3. How to Use
+ Here is an example of how to use our model.
+ First, compile the C code into a binary and disassemble the binary into assembly instructions:
```python
+ import subprocess
+ import os
+ import re
+
+ digit_pattern = r'\b0x[a-fA-F0-9]+\b'  # hexadecimal machine-code bytes
+ zeros_pattern = r'^0+\s'  # leading zeros
+ OPT = ["O0", "O1", "O2", "O3"]
+ after = "\n# What is the source code?\n"
+ fileName = 'path/to/file'
+ with open(fileName + '.c', 'r') as f:  # original source file
+     c_func = f.read()
+ for opt_state in OPT:
+     before = f"# This is the assembly code with {opt_state} optimization:\n"
+     output_file = fileName + '_' + opt_state
+     input_file = fileName + '.c'
+     compile_command = f'gcc -c -o {output_file}.o {input_file} -{opt_state} -lm'  # compile the code with GCC on Linux
+     subprocess.run(compile_command, shell=True, check=True)
+     compile_command = f'objdump -d {output_file}.o > {output_file}.s'  # disassemble the binary into assembly instructions
+     subprocess.run(compile_command, shell=True, check=True)
+
+     input_asm = ''
+     with open(output_file + '.s', 'r') as f:  # read the disassembly
+         asm = f.read()
+     asm = asm.split('Disassembly of section .text:')[-1].strip()
+     for tmp in asm.split('\n'):
+         tmp_asm = tmp.split('\t')[-1]  # remove the machine-code bytes
+         tmp_asm = tmp_asm.split('#')[0].strip()  # remove the comments
+         input_asm += tmp_asm + '\n'
+     input_asm = re.sub(zeros_pattern, '', input_asm)
+
+     input_asm_prompt = before + input_asm.strip() + after
+     with open(fileName + '_' + opt_state + '.asm', 'w', encoding='utf-8') as f:
+         f.write(input_asm_prompt)
```

+ Then use LLM4Decompile to translate the assembly instructions into C:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

+ model_path = 'arise-sustech/llm4decompile-1.3b'
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()

+ with open(fileName + '_' + opt_state + '.asm', 'r') as f:  # the prompt file written above
+     asm_func = f.read()
+ inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
+ with torch.no_grad():
+     outputs = model.generate(**inputs, max_new_tokens=512)
+ c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
```
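The decoded text in `c_func_decompile` is the model's C reconstruction of the function. A minimal follow-up, assuming the variables from the snippets above are still in scope, is to write it next to the original source for side-by-side comparison (the output file name is an arbitrary choice):

```python
# Save the decompiled function for inspection; `fileName`, `opt_state`,
# `c_func`, and `c_func_decompile` come from the snippets above.
with open(fileName + '_' + opt_state + '.decompiled.c', 'w', encoding='utf-8') as f:
    f.write(c_func_decompile)

print("Original function:\n", c_func)
print("Decompiled function:\n", c_func_decompile)
```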

### 4. License
+ This code repository is licensed under the MIT License.

### 5. Contact

+ If you have any questions, please raise an issue.
config.json CHANGED
@@ -1,10 +1,9 @@
{
- "_name_or_path": "deepseek-ai/deepseek-coder-1.3b-base",
+ "_name_or_path": "/share/luoqi/models/deepseek-coder-1.3b-base",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
- "attention_dropout": 0.0,
  "bos_token_id": 32013,
  "eos_token_id": 32014,
  "hidden_act": "silu",
@@ -25,7 +24,7 @@
  "rope_theta": 100000,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "transformers_version": "4.39.3",
+ "transformers_version": "4.34.1",
  "use_cache": false,
  "vocab_size": 32256
}
generation_config.json CHANGED
@@ -2,5 +2,5 @@
  "_from_model_config": true,
  "bos_token_id": 32013,
  "eos_token_id": 32014,
- "transformers_version": "4.39.3"
+ "transformers_version": "4.34.1"
}
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
{
  "version": "1.0",
- "truncation": null,
+ "truncation": {
+   "direction": "Right",
+   "max_length": 16384,
+   "strategy": "LongestFirst",
+   "stride": 0
+ },
  "padding": null,
  "added_tokens": [
    {
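The truncation block added to tokenizer.json is the serialized form of the fast tokenizer's truncation settings. Loading the file directly and enabling the same behaviour programmatically would look roughly like the sketch below (the path is a placeholder, and the `direction` argument assumes a reasonably recent tokenizers release):

```python
from tokenizers import Tokenizer

# Load the serialized fast tokenizer (path is a placeholder).
tok = Tokenizer.from_file("tokenizer.json")

# Mirror the new "truncation" block: right-side truncation to the 16K context.
tok.enable_truncation(
    max_length=16384,
    stride=0,
    strategy="longest_first",
    direction="right",
)
```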
tokenizer_config.json CHANGED
@@ -1,6 +1,4 @@
{
- "add_bos_token": true,
- "add_eos_token": false,
  "added_tokens_decoder": {
    "32000": {
      "content": "õ",
@@ -180,16 +178,13 @@
    }
  },
  "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|end▁of▁sentence|>",
  "legacy": true,
  "model_max_length": 16384,
  "pad_token": "<|end▁of▁sentence|>",
- "padding_side": "right",
  "sp_model_kwargs": {},
- "split_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": null,
- "use_default_system_prompt": false
+ "use_default_system_prompt": true
}