nileshhanotia committed
Commit c4b5351 · verified · 1 Parent(s): 2133124

Update app.py

Files changed (1):
  1. app.py +16 -4
app.py CHANGED
@@ -28,13 +28,19 @@ set_git_config()
 def load_data(file_paths):
     combined_data = []
     for file_path in file_paths:
+        file_path = file_path.strip()
         if not os.path.exists(file_path):
             st.error(f"File not found: {file_path}")
             return None
         try:
             with open(file_path, 'r') as f:
                 data = json.load(f)
-            combined_data.extend(data)
+            if 'intents' in data:
+                for intent in data['intents']:
+                    combined_data.extend(intent['examples'])
+            else:
+                st.error(f"Invalid format in file: {file_path}")
+                return None
         except Exception as e:
             st.error(f"Error loading dataset from {file_path}: {str(e)}")
             return None
@@ -56,8 +62,8 @@ def initialize_model_and_tokenizer(model_name, num_labels):
         return None, None
 
 def create_dataset(data, tokenizer, max_length):
-    texts = [item['prompt'] for item in data]
-    labels = [item['label'] for item in data]
+    texts = [item.get('prompt', '') for item in data]
+    labels = [item.get('label', -1) for item in data]
 
     encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
     dataset = Dataset.from_dict({
@@ -68,6 +74,8 @@ def create_dataset(data, tokenizer, max_length):
     return dataset
 
 def split_data(data, test_size=0.2):
+    if not data:
+        raise ValueError("Data is empty, cannot split.")
     random.shuffle(data)
     split_index = int(len(data) * (1 - test_size))
     return data[:split_index], data[split_index:]
@@ -100,7 +108,11 @@ def main():
 
     st.write("Preparing dataset...")
     # Split the data into train and evaluation sets
-    train_data, eval_data = split_data(data)
+    try:
+        train_data, eval_data = split_data(data)
+    except ValueError as e:
+        st.error(f"Data splitting error: {str(e)}")
+        return
 
     train_dataset = create_dataset(train_data, tokenizer, max_length)
     eval_dataset = create_dataset(eval_data, tokenizer, max_length)
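The reshaped load_data no longer accepts a flat JSON list: each file must now carry a top-level 'intents' list, and every intent's 'examples' are flattened into combined_data. Downstream, create_dataset reads a 'prompt' and a 'label' from each example. A minimal sketch of a file in the accepted shape; the file name, intent tags, and label values are illustrative, not taken from this repository:

import json

# Illustrative dataset in the shape the updated load_data accepts: a
# top-level "intents" list whose entries each carry an "examples" list.
# Tags and label values here are placeholders.
sample = {
    "intents": [
        {"tag": "greeting",
         "examples": [{"prompt": "hello there", "label": 0},
                      {"prompt": "good morning", "label": 0}]},
        {"tag": "farewell",
         "examples": [{"prompt": "see you later", "label": 1}]},
    ]
}
with open("sample_intents.json", "w") as f:
    json.dump(sample, f, indent=2)

# load_data's new loop flattens every intent's examples into one list:
combined = []
for intent in sample["intents"]:
    combined.extend(intent["examples"])
print(len(combined))  # 3 records, each carrying "prompt" and "label"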
 
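In create_dataset, switching from item['prompt'] to item.get('prompt', '') trades a hard KeyError for silent defaults: a malformed record becomes an empty text with label -1. A quick demonstration of the fallback, plus the kind of upstream filter one might add if placeholder rows should never reach training; the filter is a suggestion, not part of this commit:

records = [
    {"prompt": "hello there", "label": 0},
    {"label": 1},                 # missing "prompt" -> ""
    {"prompt": "see you later"},  # missing "label"  -> -1
]
texts = [item.get("prompt", "") for item in records]
labels = [item.get("label", -1) for item in records]
print(texts)   # ['hello there', '', 'see you later']
print(labels)  # [0, 1, -1]

# Optional guard (not in the commit): drop incomplete rows before tokenizing,
# so -1 placeholder labels never reach training.
clean = [r for r in records if "prompt" in r and "label" in r]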
 
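split_data now raises instead of shuffling an empty list, and main() converts that into a Streamlit error plus an early return. The same control flow in isolation, with print standing in for st.error since this sketch runs outside Streamlit:

import random

def split_data(data, test_size=0.2):
    if not data:
        raise ValueError("Data is empty, cannot split.")
    random.shuffle(data)
    split_index = int(len(data) * (1 - test_size))
    return data[:split_index], data[split_index:]

train, evaluation = split_data(list(range(10)))
print(len(train), len(evaluation))  # 8 2 with the default test_size=0.2

try:
    split_data([])  # empty input now fails fast
except ValueError as e:
    print(f"Data splitting error: {e}")  # main() routes this to st.error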