Spaces:
Sleeping
Sleeping
File size: 7,004 Bytes
a25d266 0a83766 997991d de7d627 145afb1 a25d266 b0226fd a25d266 3caf963 a25d266 8ad732e c8ab462 c899f78 c8ab462 2882051 c8ab462 2882051 de7d627 59782fa 1a4cbe3 c4b5351 1a4cbe3 c5b798d 1a4cbe3 c5b798d 1a4cbe3 c5b798d 1a4cbe3 59782fa a8d1617 145afb1 a8d1617 145afb1 c647b17 3237630 c647b17 a8d1617 bb19d37 6df398b b75b974 de8227a b75b974 de8227a 6df398b de8227a 6df398b de8227a b75b974 145afb1 bb19d37 b2c4316 a25d266 b0226fd a25d266 8ad732e a25d266 145afb1 b0226fd a25d266 94541f6 a25d266 94541f6 a25d266 bf9a7e9 a25d266 bf9a7e9 2aad587 bf9a7e9 2aad587 bf9a7e9 a25d266 bf9a7e9 2aad587 bf9a7e9 2aad587 bf9a7e9 b2c4316 1a4cbe3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
# app.py
import os
import json
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from huggingface_hub import HfFolder
import subprocess
from AppointmentScheduler import AppointmentScheduler
# Authenticate Hugging Face Hub
# The access token is read from Streamlit's secrets store under the "HF_TOKEN"
# key and handed to HfFolder.save_token. This runs at module level, i.e. every
# time the script is executed.
# NOTE(review): presumably this lookup fails if "HF_TOKEN" is not configured
# for the app -- confirm how a missing secret should be handled.
hf_token = st.secrets["HF_TOKEN"]
HfFolder.save_token(hf_token)
def set_git_config():
    """Set the global git identity (``user.email`` and ``user.name``).

    Runs ``git config --global`` once per setting and reports the outcome in
    the Streamlit UI. Failures are shown with ``st.error`` rather than raised,
    so the rest of the app can still load.
    """
    git_settings = (
        ('user.email', 'nilesh.hanotia@outlook.com'),
        ('user.name', 'Nilesh'),
    )
    try:
        for key, value in git_settings:
            subprocess.run(['git', 'config', '--global', key, value], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: git ran but exited non-zero (check=True).
        # OSError: git could not be started at all, e.g. FileNotFoundError
        # when the executable is not installed in the runtime image.
        st.error(f"Git configuration error: {str(e)}")
    else:
        st.success("Git configuration set successfully.")
# Apply the git identity as a module-level side effect, before the page is built.
set_git_config()
@st.cache_data
def load_data(file_paths):
    """Load and merge training examples from one or more JSON files.

    Each file is expected to hold a top-level ``'intents'`` collection whose
    entries each provide an ``'examples'`` iterable; all examples are
    concatenated in the order the files are given.

    Args:
        file_paths: Iterable of path strings. Surrounding whitespace is
            stripped and blank entries are skipped.

    Returns:
        The combined list of examples (empty when no usable path was given),
        or ``None`` if a file is missing, cannot be read/parsed, or lacks the
        ``'intents'`` key. The problem is reported through ``st.error``.
    """
    combined_data = []
    for raw_path in file_paths:
        file_path = raw_path.strip()
        if not file_path:
            # Splitting an empty text area (or a trailing comma) yields '';
            # that means "nothing entered", not "file missing".
            continue
        if not os.path.exists(file_path):
            st.error(f"File not found: {file_path}")
            return None
        try:
            # Read as UTF-8 explicitly instead of relying on the platform's
            # default encoding, which can mangle non-ASCII examples.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Add a print to inspect the data structure
            print(f"Data loaded from {file_path}: {data}")
            # Assuming you're expecting 'intents' with 'examples'
            if 'intents' not in data:
                st.error(f"Invalid format in file: {file_path}")
                return None
            for intent in data['intents']:
                combined_data.extend(intent['examples'])
        except Exception as e:
            # Broad on purpose: any read, parse, or schema failure is shown
            # in the UI and turned into a None result instead of a traceback.
            st.error(f"Error loading dataset from {file_path}: {str(e)}")
            return None
    print(f"Combined data: {combined_data}")  # Check the combined dataset
    return combined_data
@st.cache_resource
def initialize_model_and_tokenizer(model_name, num_labels):
    """Load a tokenizer and a sequence-classification model for *model_name*.

    The tokenizer is guaranteed to have a padding token afterwards, and the
    model's ``pad_token_id`` and embedding size are aligned with it.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.

    Returns:
        ``(tokenizer, model)`` on success, or ``(None, None)`` after reporting
        the failure with ``st.error``.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # Only fill in a padding token when the tokenizer lacks one.
        # Overwriting it unconditionally with eos_token would replace a valid
        # pad token with None for tokenizers that define no EOS token.
        if tokenizer.pad_token is None:
            if tokenizer.eos_token is not None:
                # GPT-2 style tokenizers: reuse EOS for padding.
                tokenizer.pad_token = tokenizer.eos_token
            else:
                # Neither pad nor EOS exists: register a dedicated pad token.
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        # Update the model config so the classifier can locate padded positions
        model.config.pad_token_id = tokenizer.pad_token_id
        # Keep the embedding matrix in sync with the tokenizer size
        # (required when a new pad token was registered above)
        model.resize_token_embeddings(len(tokenizer))
        return tokenizer, model
    except Exception as e:
        # Broad on purpose: download, config, and load failures are all
        # reported in the UI and signalled to the caller as (None, None).
        st.error(f"Error initializing model and tokenizer: {str(e)}")
        return None, None
def create_dataset(data, tokenizer, max_length, num_labels):
    """Turn raw example items into a tokenized ``Dataset``.

    Items without a truthy ``'prompt'`` are dropped. A missing ``'label'``
    counts as 0, and any label outside ``[0, num_labels)`` is replaced by 0.
    Prompts are truncated and padded to exactly ``max_length`` tokens.

    Args:
        data: Iterable of items exposing ``.get`` (e.g. dicts).
        tokenizer: Callable used to encode the list of prompts.
        max_length: Fixed token length for every encoded prompt.
        num_labels: Number of valid classes.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``.

    Raises:
        ValueError: If no item carries a usable prompt.
    """
    usable_items = [entry for entry in data if entry.get('prompt')]
    texts = [entry.get('prompt', '') for entry in usable_items]
    raw_labels = [entry.get('label', 0) for entry in usable_items]

    if len(texts) == 0:
        raise ValueError("The input texts list is empty. Please check your data.")

    # Fall back to class 0 for anything outside the valid label range
    labels = []
    for value in raw_labels:
        within_range = 0 <= value < num_labels
        labels.append(value if within_range else 0)

    # Encode every prompt to a fixed-length sequence
    tokenized = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )

    columns = {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': labels,
    }
    return Dataset.from_dict(columns)
def _run_training(model, tokenizer, data, max_length, num_labels,
                  num_epochs, batch_size, learning_rate, repo_id):
    """Tokenize an 80/20 split of *data*, fine-tune *model*, and push it to the Hub."""
    split_at = int(len(data) * 0.8)
    train_data, eval_data = data[:split_at], data[split_at:]
    print(f"Train data size: {len(train_data)}, Eval data size: {len(eval_data)}")
    try:
        train_dataset = create_dataset(train_data, tokenizer, max_length, num_labels)
        eval_dataset = create_dataset(eval_data, tokenizer, max_length, num_labels)
    except ValueError as e:
        # create_dataset raises ValueError when a split contains no usable
        # prompt, which happens with very small datasets (e.g. one example
        # leaves the training split empty). Report it instead of crashing.
        st.error(f"Could not build the training datasets: {e}")
        return
    print(f"Train dataset size: {len(train_dataset)}, Eval dataset size: {len(eval_dataset)}")
    print(f"Sample train item: {train_dataset[0] if train_dataset else 'No data'}")
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the version this app is pinned to.
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        logging_dir='./logs',
        push_to_hub=True,
        hub_model_id=repo_id,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    st.write("Training model...")
    trainer.train()
    trainer.push_to_hub()
    st.write(f"Model pushed to: {repo_id}")


def _render_scheduler():
    """Render the appointment-scheduler chat, keeping its state in ``st.session_state``."""
    st.header("Appointment Scheduler")
    # Initialize session state for conversation history and scheduler
    if 'conversation_history' not in st.session_state:
        st.session_state.conversation_history = []
        st.session_state.scheduler = AppointmentScheduler()
        st.session_state.first_interaction = True
    user_input = st.text_input("Enter patient response")
    # NOTE(review): the text input keeps its value across reruns, so a rerun
    # triggered by another widget presumably processes the same response
    # again -- confirm whether a submit form is wanted here.
    if user_input:
        # If it's the first interaction, start with the greeting
        if st.session_state.first_interaction:
            greeting = st.session_state.scheduler.handle_incoming_speech("hello")
            st.session_state.conversation_history.append(("Assistant", greeting))
            st.session_state.first_interaction = False
        # Use AppointmentScheduler to handle the response
        response = st.session_state.scheduler.handle_incoming_speech(user_input)
        st.session_state.conversation_history.append(("Patient", user_input))
        st.session_state.conversation_history.append(("Assistant", response))
    # Display conversation history
    for speaker, message in st.session_state.conversation_history:
        st.write(f"{speaker}: {message}")


def main():
    """Build the Streamlit page: training controls first, then the scheduler chat.

    The training section collects the model name, data paths, and
    hyper-parameters, and starts fine-tuning only when "Start Training" is
    clicked. The scheduler section drives an ``AppointmentScheduler``
    conversation stored in the session state.
    """
    st.title("Appointment Scheduling Platform")
    model_name = st.text_input("Enter model name", "distilgpt2")
    raw_paths = st.text_area("Enter training data paths")
    # An empty text area splits to ['']; drop blank entries so "nothing
    # entered yet" is not treated as a missing file.
    file_paths = [part.strip() for part in raw_paths.split(',') if part.strip()]
    # The second positional argument of st.number_input is min_value, not the
    # default value, so the defaults are passed by keyword. Values below the
    # defaults remain selectable.
    max_length = st.number_input("Max token length", min_value=1, value=128)
    num_epochs = st.number_input("Training epochs", min_value=1, value=3)
    batch_size = st.number_input("Batch size", min_value=1, value=8)
    # An explicit step and format are needed for small floats; the default
    # format shows only two decimals, which would render 5e-5 as 0.00.
    learning_rate = st.number_input(
        "Learning rate", min_value=0.0, value=5e-5, step=1e-5, format="%.6f"
    )
    num_labels = 3
    repo_id = st.text_input("Hugging Face Repo ID", "nileshhanotia/PeVe")

    tokenizer, model = initialize_model_and_tokenizer(model_name, num_labels)
    if tokenizer is not None and model is not None:
        data = load_data(file_paths) if file_paths else None
        if data:
            print(f"Total data loaded: {len(data)}")
            print(f"Sample data item: {data[0] if data else 'No data'}")
            # Streamlit re-executes the whole script on every widget
            # interaction, so tokenization and Trainer construction (which is
            # configured with push_to_hub=True) are deferred to the click.
            if st.button('Start Training'):
                _run_training(
                    model, tokenizer, data, max_length, num_labels,
                    num_epochs, batch_size, learning_rate, repo_id,
                )

    # Integrate AppointmentScheduler
    _render_scheduler()
# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|