Update app.py
app.py (CHANGED)
```diff
@@ -28,13 +28,19 @@ set_git_config()
 def load_data(file_paths):
     combined_data = []
     for file_path in file_paths:
+        file_path = file_path.strip()
         if not os.path.exists(file_path):
             st.error(f"File not found: {file_path}")
             return None
         try:
             with open(file_path, 'r') as f:
                 data = json.load(f)
-
+            if 'intents' in data:
+                for intent in data['intents']:
+                    combined_data.extend(intent['examples'])
+            else:
+                st.error(f"Invalid format in file: {file_path}")
+                return None
         except Exception as e:
             st.error(f"Error loading dataset from {file_path}: {str(e)}")
             return None
```
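With this change, load_data expects each JSON file to carry a top-level 'intents' list whose entries each hold an 'examples' list; judging by how create_dataset reads the items, each example is a dict with 'prompt' and 'label' keys. A minimal sketch of a file that would pass the new check, plus the parsing branch itself (the schema beyond these keys is an assumption):

```python
import json

# Hypothetical sample matching the format load_data now expects:
# a top-level "intents" list, where each intent carries an "examples" list.
# Each example is assumed to be a {"prompt": ..., "label": ...} dict,
# since create_dataset reads item.get('prompt') and item.get('label').
sample = {
    "intents": [
        {"examples": [{"prompt": "hi there", "label": 0},
                      {"prompt": "bye now", "label": 1}]}
    ]
}

with open("sample_intents.json", "w") as f:
    json.dump(sample, f, indent=2)

# Mirrors the new parsing branch in load_data:
with open("sample_intents.json") as f:
    data = json.load(f)
combined = []
if 'intents' in data:
    for intent in data['intents']:
        combined.extend(intent['examples'])
print(combined)  # [{'prompt': 'hi there', 'label': 0}, {'prompt': 'bye now', 'label': 1}]
```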
```diff
@@ -56,8 +62,8 @@ def initialize_model_and_tokenizer(model_name, num_labels):
         return None, None
 
 def create_dataset(data, tokenizer, max_length):
-    texts = [item
-    labels = [item
+    texts = [item.get('prompt', '') for item in data]
+    labels = [item.get('label', -1) for item in data]
 
     encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
     dataset = Dataset.from_dict({
```
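Switching from direct indexing to .get() means an item missing a key no longer raises KeyError; it falls back to an empty prompt or a -1 label. A small sketch of the resulting encode step, assuming a Hugging Face tokenizer (the model name is illustrative, and the fields passed to Dataset.from_dict are cut off in the diff, so input_ids/attention_mask/labels is a guess at a typical set):

```python
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative model

data = [{"prompt": "hello"}, {"label": 3}]            # second item lacks a 'prompt'
texts = [item.get('prompt', '') for item in data]     # -> ["hello", ""]
labels = [item.get('label', -1) for item in data]     # -> [-1, 3]

# Same call shape as in create_dataset: fixed-length, truncated encodings.
encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=16)
dataset = Dataset.from_dict({
    "input_ids": encodings["input_ids"],
    "attention_mask": encodings["attention_mask"],
    "labels": labels,
})
print(dataset)
```

Note that a defaulted -1 label flows straight into training, so dropping such items instead of defaulting may be worth considering.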
```diff
@@ -68,6 +74,8 @@ def create_dataset(data, tokenizer, max_length):
     return dataset
 
 def split_data(data, test_size=0.2):
+    if not data:
+        raise ValueError("Data is empty, cannot split.")
     random.shuffle(data)
     split_index = int(len(data) * (1 - test_size))
     return data[:split_index], data[split_index:]
```
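With test_size=0.2 the split point is int(len(data) * 0.8), so the new guard only catches a fully empty list; a single-item list still yields an empty train split. A quick standalone check:

```python
import random

def split_data(data, test_size=0.2):
    if not data:
        raise ValueError("Data is empty, cannot split.")
    random.shuffle(data)  # note: shuffles the caller's list in place
    split_index = int(len(data) * (1 - test_size))
    return data[:split_index], data[split_index:]

train, test = split_data(list(range(10)))
print(len(train), len(test))  # 8 2

train, test = split_data([42])
print(len(train), len(test))  # 0 1 -- train ends up empty for a single item
```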
```diff
@@ -100,7 +108,11 @@ def main():
 
     st.write("Preparing dataset...")
     # Split the data into train and evaluation sets
-
+    try:
+        train_data, eval_data = split_data(data)
+    except ValueError as e:
+        st.error(f"Data splitting error: {str(e)}")
+        return
 
     train_dataset = create_dataset(train_data, tokenizer, max_length)
     eval_dataset = create_dataset(eval_data, tokenizer, max_length)
```
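The try/except in main() turns the new ValueError into a Streamlit error box plus an early return instead of a traceback. The same pattern in isolation (prepare is a hypothetical stand-in for this slice of main, reusing the split_data sketch above):

```python
def prepare(data):
    # Mirrors the new main() flow: catch the ValueError that split_data
    # raises on empty input and surface it, rather than letting it crash.
    try:
        train_data, eval_data = split_data(data)  # split_data as sketched above
    except ValueError as e:
        print(f"Data splitting error: {e}")  # the real app calls st.error here
        return None
    return train_data, eval_data

print(prepare([]))              # prints the error message, returns None
print(prepare(list(range(5))))  # -> a 4/1 train/eval split
```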
|