File size: 7,004 Bytes
a25d266
0a83766
997991d
de7d627
145afb1
a25d266
b0226fd
a25d266
3caf963
a25d266
8ad732e
c8ab462
 
 
c899f78
c8ab462
 
 
 
2882051
c8ab462
 
 
2882051
de7d627
59782fa
1a4cbe3
 
 
c4b5351
1a4cbe3
 
 
 
 
 
c5b798d
 
 
 
 
 
 
 
 
 
1a4cbe3
c5b798d
1a4cbe3
c5b798d
1a4cbe3
59782fa
a8d1617
145afb1
a8d1617
 
145afb1
c647b17
 
 
 
3237630
 
 
c647b17
 
 
a8d1617
 
 
 
bb19d37
 
6df398b
 
b75b974
 
 
 
de8227a
b75b974
 
de8227a
 
 
 
6df398b
de8227a
6df398b
de8227a
b75b974
145afb1
 
 
 
 
 
bb19d37
b2c4316
a25d266
b0226fd
 
a25d266
 
 
 
 
 
8ad732e
a25d266
145afb1
b0226fd
a25d266
 
 
94541f6
 
 
a25d266
94541f6
 
 
 
 
 
 
 
a25d266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf9a7e9
a25d266
 
bf9a7e9
2aad587
bf9a7e9
 
2aad587
bf9a7e9
 
a25d266
 
bf9a7e9
 
2aad587
bf9a7e9
 
 
2aad587
 
bf9a7e9
 
 
 
 
 
 
b2c4316
1a4cbe3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# app.py
import os
import json
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from huggingface_hub import HfFolder
import subprocess
from AppointmentScheduler import AppointmentScheduler

# Authenticate Hugging Face Hub
# Runs at module level: reads the "HF_TOKEN" entry from Streamlit secrets and
# hands it to HfFolder.save_token. The subscript lookup is not guarded here, so
# a missing entry is not handled in this file.
# NOTE(review): presumably save_token stores the token for later Hub calls
# (e.g. the push in main) -- confirm against the huggingface_hub docs.
hf_token = st.secrets["HF_TOKEN"]
HfFolder.save_token(hf_token)

def set_git_config():
    """Set the global git ``user.email`` and ``user.name`` values.

    Runs ``git config --global`` once per setting and reports the outcome in
    the Streamlit UI: ``st.success`` when every command succeeds, ``st.error``
    otherwise. Nothing is returned and no exception is propagated for the
    handled failure modes below.

    Handled failures:
        subprocess.CalledProcessError: git exited with a non-zero status.
        OSError: the ``git`` executable could not be started at all (for
            example it is not installed, which raises ``FileNotFoundError``).
    """
    identity_settings = (
        ('user.email', 'nilesh.hanotia@outlook.com'),
        ('user.name', 'Nilesh'),
    )
    try:
        for setting, value in identity_settings:
            # check=True turns a non-zero exit status into CalledProcessError.
            subprocess.run(['git', 'config', '--global', setting, value], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing/unlaunchable git binary, which would
        # otherwise abort the whole app during module import.
        st.error(f"Git configuration error: {str(e)}")
    else:
        st.success("Git configuration set successfully.")

# Module-level call: the git identity is applied every time this module is
# executed, independently of the __main__ guard at the end of the file.
set_git_config()

@st.cache_data
def load_data(file_paths):
    """Load and merge the intent examples from several JSON files.

    Each file must contain a JSON object with an ``'intents'`` list; every
    intent's ``'examples'`` list is appended, in order, to one combined list.

    Args:
        file_paths: Iterable of path strings. Surrounding whitespace is
            stripped; entries that are blank after stripping are skipped.

    Returns:
        The combined list of examples (possibly empty when every entry was
        blank), or ``None`` when a file is missing, is not in the expected
        format, or fails to load. In the ``None`` case the reason has already
        been shown with ``st.error``.
    """
    combined_data = []
    for file_path in file_paths:
        file_path = file_path.strip()
        if not file_path:
            # "".split(',') yields [''] and a trailing comma yields a blank
            # entry; neither is a real path, so do not report "File not found".
            continue
        if not os.path.exists(file_path):
            st.error(f"File not found: {file_path}")
            return None
        try:
            # JSON text is UTF-8; do not depend on the platform default encoding.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Add a print to inspect the data structure
            print(f"Data loaded from {file_path}: {data}")

            # Only a JSON object can carry the 'intents' key; lists, strings or
            # numbers at the top level are reported as an invalid format.
            if isinstance(data, dict) and 'intents' in data:
                for intent in data['intents']:
                    combined_data.extend(intent['examples'])
            else:
                st.error(f"Invalid format in file: {file_path}")
                return None
        except Exception as e:
            # UI boundary: surface any read/parse/shape problem to the user
            # instead of crashing the app.
            st.error(f"Error loading dataset from {file_path}: {str(e)}")
            return None
    print(f"Combined data: {combined_data}")  # Check the combined dataset
    return combined_data

@st.cache_resource
def initialize_model_and_tokenizer(model_name, num_labels):
    """Load a tokenizer and a sequence-classification model, ensuring a pad token.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        num_labels: Number of classification labels for the model head.

    Returns:
        ``(tokenizer, model)`` on success, or ``(None, None)`` when anything
        fails; in that case the error has been shown with ``st.error``.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        # Only supply a pad token when the tokenizer lacks one. Overwriting it
        # unconditionally would discard a real pad token, and would set it to
        # None for tokenizers that have no EOS token.
        if tokenizer.pad_token is None:
            if tokenizer.eos_token is not None:
                # Reuse EOS for padding (the usual approach for GPT-2 style
                # tokenizers such as the default "distilgpt2").
                tokenizer.pad_token = tokenizer.eos_token
            else:
                # No EOS either: register a dedicated pad token, which grows
                # the vocabulary by one entry.
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        # Update the model config
        model.config.pad_token_id = tokenizer.pad_token_id

        # Keep the embedding matrix in sync with the tokenizer; this only
        # changes anything when a new pad token was added above.
        model.resize_token_embeddings(len(tokenizer))

        return tokenizer, model
    except Exception as e:
        st.error(f"Error initializing model and tokenizer: {str(e)}")
        return None, None

def create_dataset(data, tokenizer, max_length, num_labels):
    """Build a tokenized ``Dataset`` from records carrying 'prompt' and 'label'.

    Records whose ``'prompt'`` is missing or falsy are dropped. A missing
    ``'label'`` counts as 0, and any label outside ``[0, num_labels)`` is
    replaced by 0.

    Args:
        data: Sequence of mapping-like records (accessed through ``.get``).
        tokenizer: Callable tokenizer; invoked with truncation, padding to
            ``max_length`` and ``return_tensors='pt'``.
        max_length: Length every sequence is padded/truncated to.
        num_labels: Exclusive upper bound for valid label values.

    Returns:
        A ``Dataset`` with 'input_ids', 'attention_mask' and 'labels' columns.

    Raises:
        ValueError: If no record has a usable prompt.
    """
    usable_records = [record for record in data if record.get('prompt')]
    if not usable_records:
        raise ValueError("The input texts list is empty. Please check your data.")

    prompts = []
    targets = []
    for record in usable_records:
        prompts.append(record.get('prompt', ''))
        raw_label = record.get('label', 0)
        # Out-of-range labels fall back to class 0.
        targets.append(raw_label if 0 <= raw_label < num_labels else 0)

    # Fixed-length encoding so every row has the same shape.
    tokenized = tokenizer(
        prompts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )

    columns = {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': targets,
    }
    return Dataset.from_dict(columns)
    
def _parse_file_paths(raw_paths):
    """Return the non-blank, whitespace-stripped entries of a comma-separated string."""
    return [entry.strip() for entry in raw_paths.split(',') if entry.strip()]


def _train_and_push(model, tokenizer, data, max_length, num_labels,
                    num_epochs, batch_size, learning_rate, repo_id):
    """Split the data 80/20, fine-tune the model and push it to the Hub."""
    print(f"Total data loaded: {len(data)}")
    print(f"Sample data item: {data[0] if data else 'No data'}")

    split_index = int(len(data) * 0.8)
    train_data, eval_data = data[:split_index], data[split_index:]
    print(f"Train data size: {len(train_data)}, Eval data size: {len(eval_data)}")

    try:
        train_dataset = create_dataset(train_data, tokenizer, max_length, num_labels)
        eval_dataset = create_dataset(eval_data, tokenizer, max_length, num_labels)
    except ValueError as e:
        # create_dataset raises ValueError when a split has no usable prompts,
        # which happens for very small inputs (e.g. one example -> empty train
        # split). Report it instead of crashing the page.
        st.error(f"Could not build the train/eval datasets: {e}")
        return

    print(f"Train dataset size: {len(train_dataset)}, Eval dataset size: {len(eval_dataset)}")
    print(f"Sample train item: {train_dataset[0] if train_dataset else 'No data'}")

    # NOTE(review): newer transformers releases spell `evaluation_strategy` as
    # `eval_strategy` -- confirm against the version this app is pinned to.
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        logging_dir='./logs',
        push_to_hub=True,
        hub_model_id=repo_id,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    st.write("Training model...")
    trainer.train()
    trainer.push_to_hub()
    st.write(f"Model pushed to: {repo_id}")


def _ensure_scheduler_state():
    """Create the conversation history, scheduler and first-turn flag once per session."""
    if 'conversation_history' not in st.session_state:
        st.session_state.conversation_history = []
        st.session_state.scheduler = AppointmentScheduler()
        st.session_state.first_interaction = True


def _handle_patient_response():
    """Process the submitted patient text exactly once, then clear the input box."""
    _ensure_scheduler_state()
    user_input = st.session_state.get("patient_response", "")
    if not user_input:
        return

    # If it's the first interaction, start with the greeting
    if st.session_state.first_interaction:
        greeting = st.session_state.scheduler.handle_incoming_speech("hello")
        st.session_state.conversation_history.append(("Assistant", greeting))
        st.session_state.first_interaction = False

    # Use AppointmentScheduler to handle the response
    response = st.session_state.scheduler.handle_incoming_speech(user_input)
    st.session_state.conversation_history.append(("Patient", user_input))
    st.session_state.conversation_history.append(("Assistant", response))

    # Reset the widget so an unrelated rerun cannot replay this message and the
    # patient can send the same text again later.
    st.session_state.patient_response = ""


def main():
    """Render the Streamlit page: model fine-tuning controls and the scheduler chat.

    Training section: collects the model name, comma-separated data paths and
    hyper-parameters, loads the model and data, and -- only when the
    'Start Training' button is pressed -- builds the datasets, trains and
    pushes the result to the given Hub repository.

    Scheduler section: keeps an ``AppointmentScheduler`` and the conversation
    history in ``st.session_state`` and shows the running dialogue.
    """
    st.title("Appointment Scheduling Platform")

    model_name = st.text_input("Enter model name", "distilgpt2")
    file_paths = _parse_file_paths(st.text_area("Enter training data paths"))
    # The second positional parameter of st.number_input is min_value, not the
    # default value, so the bounds and defaults are spelled out by keyword.
    max_length = st.number_input("Max token length", min_value=1, value=128)
    num_epochs = st.number_input("Training epochs", min_value=1, value=3)
    batch_size = st.number_input("Batch size", min_value=1, value=8)
    learning_rate = st.number_input(
        "Learning rate", min_value=0.0, value=5e-5, step=1e-5, format="%.6f"
    )
    num_labels = 3

    repo_id = st.text_input("Hugging Face Repo ID", "nileshhanotia/PeVe")
    tokenizer, model = initialize_model_and_tokenizer(model_name, num_labels)

    # Explicit None checks: the loader returns (None, None) on failure, and
    # object truthiness is not a reliable stand-in for "was loaded".
    if tokenizer is not None and model is not None and file_paths:
        data = load_data(file_paths)
        # Tokenization and Trainer construction are deferred until the button
        # is pressed so they do not run on every rerun of the script.
        if data and st.button('Start Training'):
            _train_and_push(
                model, tokenizer, data, max_length, num_labels,
                num_epochs, batch_size, learning_rate, repo_id,
            )

    # Integrate AppointmentScheduler
    st.header("Appointment Scheduler")

    # Initialize session state for conversation history and scheduler
    _ensure_scheduler_state()

    # The callback fires only when the text changes, so interacting with other
    # widgets no longer re-submits the previous patient message.
    st.text_input(
        "Enter patient response",
        key="patient_response",
        on_change=_handle_patient_response,
    )

    # Display conversation history
    for speaker, message in st.session_state.conversation_history:
        st.write(f"{speaker}: {message}")

# Call main() only when this file is executed as the main module, not when it
# is imported from elsewhere.
if __name__ == "__main__":
    main()