chilly-magician committed on
Commit
6a7f508
1 Parent(s): fe21650

[add]: test parser script

scripts/calculate_metrics.py ADDED
File without changes (empty file)
scripts/test_parser.py ADDED
@@ -0,0 +1,148 @@
+ import argparse
+ import json
+ import os
+
+ from typing import Optional, Tuple
+
+ from tqdm.auto import tqdm
+
+ import torch
+
+ from datasets import Dataset, load_dataset
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+ def check_base_path(path: str) -> Optional[str]:
+     # Validate that the directory the output file will be written to exists.
+     if path is not None:
+         base_path = os.path.dirname(path) or '.'
+         if os.path.exists(base_path):
+             return path
+         raise argparse.ArgumentTypeError(f'Path not found: {base_path}')
+     return path
+
+
+ def parse_args():
+     DEFAULT_MODEL_ID = 'EmbeddingStudio/query-parser-falcon-7b-instruct'
+     DEFAULT_DATASET = 'EmbeddingStudio/query-parsing-instructions-falcon'
+     DEFAULT_SPLIT = 'test'
+     DEFAULT_INSTRUCTION_FIELD = 'text'
+     DEFAULT_RESPONSE_DELIMITER = '## Response:\n'
+     DEFAULT_CATEGORY_DELIMITER = '## Category:'
+     DEFAULT_OUTPUT_PATH = f'{DEFAULT_MODEL_ID.split("/")[-1]}-test.json'
+
+     parser = argparse.ArgumentParser(description='EmbeddingStudio script for testing Zero-Shot Search Query Parsers')
+     parser.add_argument("--model-id",
+                         help=f"Huggingface model ID (default: {DEFAULT_MODEL_ID})",
+                         default=DEFAULT_MODEL_ID,
+                         type=str,
+                         )
+     parser.add_argument("--dataset-name",
+                         help=f"Huggingface dataset name which contains instructions (default: {DEFAULT_DATASET})",
+                         default=DEFAULT_DATASET,
+                         type=str,
+                         )
+     parser.add_argument("--dataset-split",
+                         help=f"Huggingface dataset split name (default: {DEFAULT_SPLIT})",
+                         default=DEFAULT_SPLIT,
+                         type=str,
+                         )
+     parser.add_argument("--dataset-instructions-field",
+                         help=f"Huggingface dataset field with instructions (default: {DEFAULT_INSTRUCTION_FIELD})",
+                         default=DEFAULT_INSTRUCTION_FIELD,
+                         type=str,
+                         )
+     parser.add_argument("--instructions-response-delimiter",
+                         help=f"Instruction response delimiter (default: {DEFAULT_RESPONSE_DELIMITER})",
+                         default=DEFAULT_RESPONSE_DELIMITER,
+                         type=str,
+                         )
+     parser.add_argument("--instructions-category-delimiter",
+                         help=f"Instruction category name delimiter (default: {DEFAULT_CATEGORY_DELIMITER})",
+                         default=DEFAULT_CATEGORY_DELIMITER,
+                         type=str,
+                         )
+     parser.add_argument("--output",
+                         help=f"JSON file with test results (default: {DEFAULT_OUTPUT_PATH})",
+                         default=DEFAULT_OUTPUT_PATH,
+                         type=check_base_path,
+                         )
+     args = parser.parse_args()
+     return args
+
+
+ def load_model(model_id: str) -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_id,
+         trust_remote_code=True,
+         add_prefix_space=True,
+         use_fast=False,
+     )
+     tokenizer.pad_token = tokenizer.eos_token
+     model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"": 0})
+     return tokenizer, model
+
+
+ @torch.no_grad()
+ def predict(
+     tokenizer: AutoTokenizer,
+     model: AutoModelForCausalLM,
+     dataset: Dataset,
+     index: int,
+     field_name: str = 'text',
+     response_delimiter: str = '## Response:\n',
+     category_delimiter: str = '## Category: '
+ ) -> Tuple[dict, dict, str]:
+     # Split the instruction into the prompt (everything up to and including the
+     # response delimiter) and the reference answer (the JSON after it).
+     input_text = dataset[index][field_name].split(response_delimiter)[0] + response_delimiter
+     input_ids = tokenizer.encode(input_text, return_tensors='pt')
+     real = json.loads(dataset[index][field_name].split(response_delimiter)[-1])
+     category = dataset[index][field_name].split(category_delimiter)[-1].split('\n')[0]
+
+     # Generating text
+     output = model.generate(input_ids.to(model.device),
+                             max_new_tokens=1000,
+                             do_sample=True,
+                             temperature=0.05,
+                             pad_token_id=tokenizer.eos_token_id,
+                             )
+     parsed = json.loads(tokenizer.decode(output[0], skip_special_tokens=True).split(response_delimiter)[-1])
+
+     return parsed, real, category
+
+
+ @torch.no_grad()
+ def test_model(model_id: str,
+                dataset_name: str,
+                split_name: str,
+                field_name: str,
+                response_delimiter: str,
+                category_delimiter: str,
+                output_path: str,
+                ):
+     # load_dataset with an explicit split returns a Dataset, so it is indexed
+     # directly rather than by split name.
+     dataset = load_dataset(dataset_name, split=split_name)
+     tokenizer, model = load_model(model_id)
+     model.eval()
+
+     test_results = []
+     for index in tqdm(range(len(dataset))):
+         try:
+             test_results.append(predict(tokenizer, model, dataset, index, field_name, response_delimiter, category_delimiter))
+         except Exception:
+             # Skip examples whose generated output is not valid JSON.
+             continue
+
+     with open(output_path, 'w') as f:
+         json.dump(test_results, f)
+
+
+ if __name__ == '__main__':
+     args = parse_args()
+     test_model(
+         args.model_id,
+         args.dataset_name,
+         args.dataset_split,
+         args.dataset_instructions_field,
+         args.instructions_response_delimiter,
+         args.instructions_category_delimiter,
+         args.output,
+     )
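
For reference, a minimal sketch of driving the tester from Python rather than the command line, assuming the script's own defaults and a CUDA-capable machine; the call below mirrors the DEFAULT_* constants in parse_args and is illustrative, not part of the commit:

from test_parser import test_model

# Equivalent to running scripts/test_parser.py with all defaults.
test_model(
    'EmbeddingStudio/query-parser-falcon-7b-instruct',    # model_id
    'EmbeddingStudio/query-parsing-instructions-falcon',  # dataset_name
    'test',                                               # split_name
    'text',                                               # field_name
    '## Response:\n',                                     # response_delimiter
    '## Category:',                                       # category_delimiter
    'query-parser-falcon-7b-instruct-test.json',          # output_path
)

The output file holds a JSON list of [parsed, real, category] triples, one per test example whose generation produced valid JSON.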