Spaces:
No application file
No application file
# Load model, tokenizer, and dataset for fine-tuning a SQL query generator.
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorWithPadding
from datasets import load_dataset  # was missing: load_dataset is used below

MODEL_NAME = "defog/sqlcoder-34b-alpha"

# "text-generation" is the registered pipeline task for a causal LM;
# the original string "SQL_Query_Generator" is not a valid task and raises.
pipe = pipeline("text-generation", model=MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# A local JSON file must be loaded through the "json" builder with
# data_files=...; passing the path as the builder name fails.
# (Original also had a typo: load_datset.)
raw_dataset = load_dataset("json", data_files="sql_train_dataset.json")

# %% section 1 (preparing the dataset for fine-tuning)
def tokenize_func(batch):
    """Tokenize question/answer pairs as a text pair, truncating to the
    model's max length. `batch` is a dict of column-name -> list (batched map).
    """
    return tokenizer(batch["question"], batch["answer"], truncation=True)

# One consistent name: the original bound `tokenize_dataset` but later read
# `tokenized_dataset` and `tokenized_datasets` (both NameErrors).
tokenized_dataset = raw_dataset.map(tokenize_func, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# NOTE(review): Llama-family tokenizers typically do not emit
# "token_type_ids" — confirm this column exists before training.
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["answer"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
# NOTE(review): the JSON loader puts everything in a "train" split by
# default — a "validation" split must exist in the data files for this line.
tf_validation_dataset = tokenized_dataset["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["answer"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)