404NotF0und committed
Commit 400b307
1 Parent(s): a5033ce

Update README.md

Files changed (1):
  1. README.md +127 -19
README.md CHANGED
@@ -3,8 +3,17 @@ tags:
 - autotrain
 - text-generation
 widget:
- - text: "I love AutoTrain because "
- license: other
+ - text: >-
+     Create the Forge script for this magic card { "name": "Wrench", "mana_cost":
+     "{W}", "type_line": "Artifact— Clue Equipment", "oracle_text": "Equipped
+     creature gets +1/+1 and has vigilance and "{3}, {T}: Tap target creature."
+     {2}, Sacrifice CARD_NAME: Draw a card. Equip {2}'"}
+ license: mit
+ metrics:
+ - accuracy
+ - perplexity
+ datasets:
+ - 404NotF0und/MtG-json-to-ForgeScript
 ---
 
 # Model Trained Using AutoTrain
@@ -12,29 +21,128 @@ license: other
 This model was trained using AutoTrain. For more information, please visit [AutoTrain](https://hf.co/docs/autotrain).
 
 # Usage
-
- ```python
-
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- model_path = "PATH_TO_THIS_REPO"
-
- tokenizer = AutoTokenizer.from_pretrained(model_path)
- model = AutoModelForCausalLM.from_pretrained(
-     model_path,
-     device_map="auto",
-     torch_dtype='auto'
- ).eval()
-
- # Prompt content: "hi"
- messages = [
-     {"role": "user", "content": "hi"}
- ]
-
- input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
- output_ids = model.generate(input_ids.to('cuda'))
- response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
-
- # Model response: "Hello! How can I assist you today?"
- print(response)
- ```
+ - Install the dependencies and clone the dataset first:
+ ```
+ pip install transformers datasets matplotlib pandas git-lfs jiwer tqdm numpy
+ git clone https://huggingface.co/datasets/404NotF0und/MtG-json-to-ForgeScribe
+ ```
+
+ The following code is an example of usage, taken from a Kaggle notebook:
+ ```python
+ import torch
+ import random
+ import csv
+ import pandas as pd
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from collections.abc import Sequence
+
+ # Read a CSV file and extract the instruction, input, and output columns
+ def read_dataset(file_path):
+     print(f"Reading dataset from {file_path}")
+     data = []
+     with open(file_path, encoding="utf-8") as csv_file:
+         csv_reader = csv.DictReader(csv_file)  # DictReader addresses columns by name
+         for row in csv_reader:
+             json_input = f"{row['instruction']} {row['input']}"  # 'input' holds the JSON card
+             target_dsl = row["output"]  # 'output' holds the target Forge script
+             data.append((json_input, target_dsl))
+     return data
+
+ # Load the model and tokenizer from Hugging Face and move the model to the device
+ def load_model(model_name, read_token, device):
+     tokenizer = AutoTokenizer.from_pretrained(model_name, token=read_token)
+     model = AutoModelForCausalLM.from_pretrained(model_name, token=read_token).to(device)
+     return tokenizer, model
+
+ # Run inference (text generation)
+ def run_inference(model, tokenizer, prompt, max_length=300):
+     # Encode the prompt text and move it to the same device as the model
+     input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
+
+     # Generate text using the model
+     output_sequences = model.generate(
+         input_ids=input_ids,
+         max_length=max_length,
+         temperature=0.5,
+         top_k=50,
+         top_p=0.95,
+         pad_token_id=tokenizer.eos_token_id,
+         do_sample=True
+     )
+
+     # Decode the generated text; the completion follows the '###' delimiter
+     generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
+     response = generated_text.split('###')[1]
+     print(response)
+
+     return response
+ ```
+ ```python
+ read_token = 'hf_YOUR_TOKEN'
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model_name = '404NotF0und/lunar-llm-phi-2-3epoch'
+
+ # Paths into the cloned dataset repository
+ validation_path = "MtG-json-to-ForgeScribe/compiled_cards_data_validation.csv"
+ test_path = "MtG-json-to-ForgeScribe/compiled_cards_data_test.csv"
+ train_path = "MtG-json-to-ForgeScribe/compiled_cards_data_train.csv"
+
+ # Read the datasets
+ validation_data = read_dataset(validation_path)
+ test_data = read_dataset(test_path)
+ train_data = read_dataset(train_path)
+ ```
+
+ ```python
+ def get_random_prompts(dataset, num_samples=3):
+     if not isinstance(dataset, Sequence):
+         dataset = list(dataset)
+
+     if len(dataset) < num_samples:
+         raise ValueError(f"Dataset does not have enough elements to sample {num_samples} items.")
+
+     random_elements = random.sample(dataset, num_samples)
+
+     # Build a list of dicts with 'json_input' and 'max_length' for each sampled element
+     prompts = [
+         {
+             'json_input': element[0],
+             'max_length': len(f"{element[0]}\n### Response: {element[1]}")  # length of prompt plus reference response
+         }
+         for element in random_elements
+     ]
+
+     return prompts
+
+ # prompts could be populated with get_random_prompts(validation_data);
+ # here a single prompt is hard-coded instead
+ try:
+     prompts = [
+         {
+             'json_input': "Create the Forge script for this magic card { \"name\": \"Wrench\", \"mana_cost\": \"{W}\", \"type_line\": \"Artifact\u2014 Clue Equipment\", \"oracle_text\": \"Equipped creature gets +1/+1 and has vigilance and \"{3}, {T}: Tap target creature.\"\n{2}, Sacrifice CARD_NAME: Draw a card.\nEquip {2}'\"}",
+             'max_length': 100
+         }
+     ]
+ except ValueError as e:
+     print(e)
+ ```
+
+ ```python
+ # Load the model and tokenizer
+ tokenizer, model = load_model(model_name, read_token, device)
+
+ for prompt in prompts:
+     print(f"### Question: {prompt['json_input']} \n")
+     print("\n" + "-"*80 + "\n")
+     # Run inference (text generation); the response is printed inside run_inference
+     generated_text = run_inference(model, tokenizer, prompt['json_input'])
+     print("\n" + "="*80 + "\n")  # separator for readability
+ ```
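
Beyond the notebook flow the commit adds, the checkpoint can be smoke-tested through the `transformers` pipeline API. The sketch below is not part of the commit: the model id and token placeholder are taken from the README above, the prompt must follow the widget example's format (its card JSON is elided here, not filled in), and the generation settings simply mirror `run_inference`.

```python
import torch
from transformers import pipeline

# Minimal smoke test (a sketch); assumes the checkpoint is reachable with your token
generator = pipeline(
    "text-generation",
    model="404NotF0und/lunar-llm-phi-2-3epoch",
    token="hf_YOUR_TOKEN",  # same placeholder as in the README
    device=0 if torch.cuda.is_available() else -1,
)

# The prompt format mirrors the widget example; paste a full card JSON where elided
prompt = 'Create the Forge script for this magic card { "name": "Wrench", "mana_cost": "{W}", ... }'
result = generator(prompt, max_length=300, do_sample=True, temperature=0.5, top_k=50, top_p=0.95)
print(result[0]["generated_text"])
```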
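The new metadata declares accuracy and perplexity as metrics, but the commit does not show how they were computed. One common recipe takes the perplexity of a causal LM on a reference completion as the exponentiated cross-entropy of that completion given the prompt. A minimal sketch, assuming the `model`, `tokenizer`, and `validation_data` objects from the code above and borrowing the `\n### Response: ` template from `get_random_prompts` (the actual training template is an assumption):

```python
import math
import torch

def reference_perplexity(model, tokenizer, json_input, target_dsl):
    # Concatenate prompt and reference using the README's response template
    text = f"{json_input}\n### Response: {target_dsl}"
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    # Mask out the prompt tokens so the loss covers only the reference
    # (token boundaries at the join are approximate)
    prompt_len = tokenizer(f"{json_input}\n### Response: ", return_tensors="pt").input_ids.shape[1]
    labels = input_ids.clone()
    labels[:, :prompt_len] = -100  # positions set to -100 are ignored by the loss

    with torch.no_grad():
        loss = model(input_ids, labels=labels).loss  # mean cross-entropy per reference token
    return math.exp(loss.item())

# Example: score the first validation pair
json_input, target_dsl = validation_data[0]
print(f"perplexity: {reference_perplexity(model, tokenizer, json_input, target_dsl):.2f}")
```

Accuracy would additionally need a match criterion between the generated script and `target_dsl`; `jiwer`, from the install line, could supply an error-rate comparison for that purpose.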