404NotF0und
committed on
Commit
•
400b307
1
Parent(s):
a5033ce
Update README.md
Browse files
README.md
CHANGED
@@ -3,8 +3,17 @@ tags:
|
|
3 |
- autotrain
|
4 |
- text-generation
|
5 |
widget:
|
6 |
-
- text:
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
---
|
9 |
|
10 |
# Model Trained Using AutoTrain
|
@@ -12,29 +21,128 @@ license: other
|
|
12 |
This model was trained using AutoTrain. For more information, please visit [AutoTrain](https://hf.co/docs/autotrain).
|
13 |
|
14 |
# Usage
|
|
|
|
|
|
|
|
|
|
|
15 |
|
|
|
16 |
```python
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
model_path = "PATH_TO_THIS_REPO"
|
21 |
|
22 |
-
tokenizer
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
# Prompt content: "hi"
|
30 |
-
messages = [
|
31 |
-
{"role": "user", "content": "hi"}
|
32 |
-
]
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
37 |
|
38 |
-
#
|
39 |
-
print(
|
|
|
40 |
```
|
|
|
3 |
- autotrain
|
4 |
- text-generation
|
5 |
widget:
|
6 |
+
- text: >-
|
7 |
+
Create the Forge script for this magic card { "name": "Wrench", "mana_cost":
|
8 |
+
"{W}", "type_line": "Artifact— Clue Equipment", "oracle_text": "Equipped
|
9 |
+
creature gets +1/+1 and has vigilance and "{3}, {T}: Tap target creature."
|
10 |
+
{2}, Sacrifice CARD_NAME: Draw a card. Equip {2}'"}
|
11 |
+
license: mit
|
12 |
+
metrics:
|
13 |
+
- accuracy
|
14 |
+
- perplexity
|
15 |
+
datasets:
|
16 |
+
- 404NotF0und/MtG-json-to-ForgeScript
|
17 |
---
|
18 |
|
19 |
# Model Trained Using AutoTrain
|
|
|
21 |
This model was trained using AutoTrain. For more information, please visit [AutoTrain](https://hf.co/docs/autotrain).
|
22 |
|
23 |
# Usage
|
24 |
+
- Install the required packages and clone the dataset first
|
25 |
+
```
|
26 |
+
pip install transformers datasets matplotlib pandas git-lfs jiwer tqdm numpy
|
27 |
+
git clone https://huggingface.co/datasets/404NotF0und/MtG-json-to-ForgeScribe
|
28 |
+
```
|
29 |
|
30 |
+
The following code is an example of usage, as run in a Kaggle notebook
|
31 |
```python
|
32 |
+
import torch
|
33 |
+
import random
|
34 |
+
import csv
|
35 |
+
import pandas as pd
|
36 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
37 |
+
from collections.abc import Sequence
|
38 |
|
39 |
+
# Function to read the CSV files and extract the relevant columns
|
40 |
+
def read_dataset(file_path):
    """Read a CSV dataset and return ``(prompt, target)`` pairs.

    Every row must provide ``instruction``, ``input`` and ``output`` columns.
    The prompt is the instruction and the input joined by a single space; the
    target is the ``output`` column, unchanged.

    Args:
        file_path: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        A list of ``(json_input, target_dsl)`` tuples, one per data row.
        A file that contains only a header yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If one of the required columns is missing.
    """
    print(f"Reading dataset from {file_path}")
    # newline="" leaves line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are kept exactly as they were written.
    with open(file_path, encoding="utf-8", newline="") as csv_file:
        return [
            (f"{row['instruction']} {row['input']}", row["output"])
            for row in csv.DictReader(csv_file)  # DictReader: columns by name
        ]
|
50 |
|
|
|
51 |
|
52 |
+
# Function to load the model and tokenizer from Hugging Face
|
53 |
+
def load_model(model_name, read_token, device):
    """Load the tokenizer and causal language model for *model_name*.

    Args:
        model_name: Model identifier handed to ``from_pretrained``.
        read_token: Access token forwarded as ``token=`` to both loaders.
        device: Accepted but not used by this function; the model is
            returned exactly as ``from_pretrained`` produced it.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # The tokenizer is loaded first, then the model.
    return (
        AutoTokenizer.from_pretrained(model_name, token=read_token),
        AutoModelForCausalLM.from_pretrained(model_name, token=read_token),
    )
|
57 |
+
|
58 |
+
# Function to run inference (text generation)
|
59 |
+
def run_inference(model, tokenizer, prompt, max_length=300):
    """Generate text for *prompt* and return the part after the first ``###``.

    The decoded output is split on ``###`` and the segment that follows the
    first marker (up to the next marker, if any) is printed and returned.
    When the output contains no ``###`` at all, the whole decoded text is
    printed and returned instead of raising ``IndexError``.

    Args:
        model: Object exposing ``generate(...)``; if it also exposes a
            ``device`` attribute, the encoded prompt is moved there first.
        tokenizer: Object exposing ``encode``, ``decode`` and
            ``eos_token_id``.
        prompt: Text to feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The selected portion of the generated text, as a string.
    """
    # Encode the prompt text
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Keep the inputs on the same device as the model when it reports one,
    # so generation also works after the model has been moved off the CPU.
    target_device = getattr(model, "device", None)
    if target_device is not None:
        input_ids = input_ids.to(target_device)

    # Generate text using the model (sampling-based decoding)
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=0.5,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

    # Decode the generated text
    generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

    # Split once; fall back to the full text if the marker never appears.
    segments = generated_text.split('###')
    response = segments[1] if len(segments) > 1 else generated_text

    print(response)
    return response
|
80 |
+
```
|
81 |
+
```python
|
82 |
+
# Hugging Face access token used to download the model
read_token = 'hf_YOUR_TOKEN'

# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = '404NotF0und/lunar-llm-phi-2-3epoch'

# Locations of the dataset splits inside the cloned dataset repository
validation_path = "MtG-json-to-ForgeScribe/compiled_cards_data_validation.csv"
test_path = "MtG-json-to-ForgeScribe/compiled_cards_data_test.csv"
train_path = "MtG-json-to-ForgeScribe/compiled_cards_data_train.csv"

# Read the datasets (each split from its own file)
validation_data = read_dataset(validation_path)
test_data = read_dataset(test_path)
train_data = read_dataset(train_path)
|
96 |
+
```
|
97 |
+
|
98 |
+
```python
|
99 |
+
def get_random_prompts(dataset, num_samples=3):
    """Pick *num_samples* random entries from *dataset* and build prompts.

    Args:
        dataset: Sequence (or any iterable, which is copied into a list) of
            items whose index 0 is the prompt text and index 1 the target.
        num_samples: How many entries to draw, without replacement.

    Returns:
        A list of dicts with ``'json_input'`` (the prompt text) and
        ``'max_length'`` (the character count of the prompt, the
        ``"\\n### Response: "`` separator and the target combined).

    Raises:
        ValueError: If the dataset holds fewer than *num_samples* entries.
    """
    # random.sample needs a sequence, so materialize other iterables first.
    pool = dataset if isinstance(dataset, Sequence) else list(dataset)

    if num_samples > len(pool):
        raise ValueError(f"Dataset does not have enough elements to sample {num_samples} items.")

    prompts = []
    for chosen in random.sample(pool, num_samples):
        # Character length of the full "prompt + separator + target" text
        full_text = f"{chosen[0]}\n### Response: {chosen[1]}"
        prompts.append({'json_input': chosen[0], 'max_length': len(full_text)})
    return prompts
|
118 |
+
|
119 |
+
# Prompts to run through the model. A single hand-written example is used here;
# to sample prompts from a loaded dataset instead, call
# get_random_prompts(<dataset>, <count>), which raises ValueError when the
# dataset is too small.
prompts = [
    {
        'json_input': "Create the Forge script for this magic card { \"name\": \"Wrench\", \"mana_cost\": \"{W}\", \"type_line\": \"Artifact\u2014 Clue Equipment\", \"oracle_text\": \"Equipped creature gets +1/+1 and has vigilance and \"{3}, {T}: Tap target creature.\"\n{2}, Sacrifice CARD_NAME: Draw a card.\nEquip {2}'\"}",
        'max_length': 100
    }
]
|
129 |
+
|
130 |
+
```
|
131 |
+
|
132 |
+
```python
|
133 |
+
# Fetch the tokenizer and model once, before looping over the prompts
tokenizer, model = load_model(model_name, read_token, device)

# Separator lines printed around each generation, for readability
dash_rule = "\n" + "-"*80 + "\n"
equals_rule = "\n" + "="*80 + "\n"

for prompt in prompts:
    print(f"### Question: {prompt['json_input']} \n")
    print(dash_rule)
    # run_inference prints the generated response itself
    generated_text = run_inference(model, tokenizer, prompt['json_input'])
    print(equals_rule)
|
148 |
```
|