"""Fine-tune GPT-2 medium on a small hand-written chat corpus, then chat with it.

The script runs top to bottom:

1. Build ``chat_data``, a list of ``(user_turn, bot_turn)`` string pairs.
2. Load the ``gpt2-medium`` tokenizer and causal language model.
3. Tokenize every pair as ``user_turn + EOS + bot_turn + EOS`` and fine-tune
   with the Hugging Face ``Trainer``.
4. Start an interactive loop that answers user input via ``generate_reply_hf``.
"""

import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer
from transformers import TrainingArguments

# Fixed sequence length (in tokens) used for truncation and padding.
MAX_SEQ_LENGTH = 64

# Label value that the causal-LM loss skips; used to mask padding positions.
IGNORE_INDEX = -100

# dataset - User's custom data
chat_data = [
    ("User: hi", "Bot: hello!"),
    ("User: how are you?", "Bot: I'm good."),
    ("User: bye", "Bot: see you!"),
    ("User: hello", "Bot: hi there!"),
    ("User: what's up?", "Bot: Not much, you?"),
    ("User: i'm good too", "Bot: That's great!"),
    ("User: see you later", "Bot: Bye for now!"),
    ("User: thank you", "Bot: You're welcome!"),
    ("User: how is it going?", "Bot: Pretty good, thanks!"),
    ("User: tell me a joke", "Bot: Why did the scarecrow win an award? Because he was outstanding in his field!"),
    ("User: what is your name?", "Bot: I don't have a name."),
    ("User: what are you?", "Bot: I am a simple chatbot."),
    ("User: good morning", "Bot: Good morning to you too!"),
    ("User: good night", "Bot: Good night, sleep tight!"),
    ("User: u good?", "Bot: I'm good."),  # Added variation
    ("User: how are u?", "Bot: I'm good."),  # Added variation
    ("User: whats up", "Bot: Not much, you?"),  # Added variation
    ("User: hey", "Bot: hello!"),  # Added variation
    ("User: thanks", "Bot: You're welcome!"),  # Added variation
    ("User: what are you doing?", "Bot: Just chatting with you!"),
    ("User: what do you like?", "Bot: I like helping people."),
    ("User: where are you from?", "Bot: I exist in the digital world."),
    ("User: how old are you?", "Bot: I don't have an age."),
    ("User: can you help me?", "Bot: Yes, I can try."),
    ("User: what is the weather like?", "Bot: I don't have access to real-time weather information."),
    ("User: what is the capital of France?", "Bot: The capital of France is Paris."),
    ("User: what is 2 plus 2?", "Bot: 2 plus 2 is 4."),
    ("User: tell me something interesting", "Bot: Did you know that honey never spoils?"),
    ("User: how can I learn to code?", "Bot: There are many online resources and courses available."),
    ("User: what is your favorite color?", "Bot: I don't have a favorite color."),
    ("User: what is your favorite food?", "Bot: I don't eat food."),
    ("User: what is the meaning of life?", "Bot: The meaning of life is a philosophical question."),
    ("User: what is your purpose?", "Bot: My purpose is to assist you."),
    ("User: are you a robot?", "Bot: I am a computer program."),
    ("User: are you a person?", "Bot: I am not a person."),
    ("User: do you have feelings?", "Bot: I do not have feelings."),
    ("User: do you have a body?", "Bot: I do not have a physical body."),
    ("User: can you think?", "Bot: I can process information and generate responses."),
    ("User: what is the time?", "Bot: I don't have access to the current time."),
    ("User: what day is it?", "Bot: I don't have access to the current date."),
    ("User: goodbye", "Bot: Goodbye!"),
    ("User: talk to you later", "Bot: Talk to you later!"),
    ("User: see ya", "Bot: See you!"),
    ("User: later", "Bot: Later!"),
    ("User: peace out", "Bot: Peace out!"),
    ("User: have a good day", "Bot: You too!"),
    ("User: good afternoon", "Bot: Good afternoon!"),
    ("User: good evening", "Bot: Good evening!"),
    ("User: how do you work?", "Bot: I process your input and generate a response based on my training data."),
    ("User: what languages do you speak?", "Bot: I primarily communicate in English."),
]

# Gratitude, general knowledge, greetings, self-description and advice.
chat_data.extend([
    ("User: thank you very much", "Bot: You are most welcome!"),
    ("User: thanks a lot", "Bot: No problem at all."),
    ("User: i really appreciate it", "Bot: It was my pleasure."),
    ("User: cheers", "Bot: Anytime!"),
    ("User: thanks a bunch", "Bot: Glad I could help."),
    ("User: you're the best", "Bot: Just doing my job!"),
    ("User: thank you for your help", "Bot: Happy to assist."),
    ("User: thanks for the info", "Bot: You're welcome!"),
    ("User: i owe you one", "Bot: Don't mention it."),
    ("User: thank you so much", "Bot: My pleasure."),
    ("User: what is the weather forecast for tomorrow?", "Bot: I cannot provide real-time weather forecasts."),
    ("User: tell me about the history of the internet", "Bot: The internet originated from ARPANET in the late 1960s."),
    ("User: what is machine learning?", "Bot: Machine learning is a type of artificial intelligence that allows computers to learn from data."),
    ("User: can you recommend a good book?", "Bot: I don't have access to a book database."),
    ("User: what is the capital of Japan?", "Bot: The capital of Japan is Tokyo."),
    ("User: how do I bake a cake?", "Bot: I can't provide detailed instructions for baking."),
    ("User: what is the tallest mountain in the world?", "Bot: Mount Everest is the tallest mountain."),
    ("User: tell me about the solar system", "Bot: The solar system consists of the sun and planets orbiting it."),
    ("User: what is photosynthesis?", "Bot: Photosynthesis is how plants convert light energy into chemical energy."),
    ("User: what is the speed of light?", "Bot: The speed of light is approximately 299,792 kilometers per second."),
    ("User: how are you doing today?", "Bot: I'm doing well, thank you for asking."),
    ("User: what have you been up to?", "Bot: Just processing information."),
    ("User: tell me something funny", "Bot: Why don't scientists trust atoms? Because they make up everything!"),
    ("User: what is your favorite animal?", "Bot: I don't have preferences like animals."),
    ("User: what is the furthest planet from the sun?", "Bot: Neptune is generally considered the furthest planet."),
    ("User: can you write a poem?", "Bot: I can generate text, but creative writing is complex."),
    ("User: what is the largest ocean?", "Bot: The Pacific Ocean is the largest."),
    ("User: tell me about artificial intelligence", "Bot: AI is the simulation of human intelligence in machines."),
    ("User: what is the boiling point of water?", "Bot: The boiling point of water is 100 degrees Celsius at standard pressure."),
    ("User: what is the population of the world?", "Bot: I don't have real-time population data."),
    ("User: how do computers work?", "Bot: Computers process information using hardware and software."),
    ("User: what is the internet of things?", "Bot: IoT refers to interconnected physical devices."),
    ("User: tell me about blockchain", "Bot: Blockchain is a decentralized digital ledger."),
    ("User: what is cybersecurity?", "Bot: Cybersecurity is the protection of systems and data from theft or damage."),
    ("User: what is big data?", "Bot: Big data refers to extremely large datasets that may be analyzed computationally."),
    ("User: what is cloud computing?", "Bot: Cloud computing is the delivery of computing services over the internet."),
    ("User: what is quantum computing?", "Bot: Quantum computing uses quantum mechanics to solve complex problems."),
    ("User: tell me about virtual reality", "Bot: VR is a simulated experience that can be similar to or completely different from the real world."),
    ("User: what is augmented reality?", "Bot: AR overlays digital information onto the real world."),
    ("User: what is a neural network?", "Bot: A neural network is a type of machine learning model inspired by the human brain."),
    ("User: good afternoon, how are you?", "Bot: Good afternoon! I am functioning as expected."),
    ("User: hello there, how's it going?", "Bot: Hello! Things are proceeding smoothly."),
    ("User: hi, what's new?", "Bot: Just the usual processing of information."),
    ("User: hey, how have you been?", "Bot: I exist outside of time, so I don't experience 'being.'"),
    ("User: good evening, anything interesting happening?", "Bot: Good evening. I am ready to assist you."),
    ("User: tell me about your capabilities", "Bot: I can process text and generate responses based on my training."),
    ("User: what are your limitations?", "Bot: I lack consciousness, emotions, and real-world experiences."),
    ("User: can you learn?", "Bot: I learn during my training phase."),
    ("User: how do you store information?", "Bot: Information is encoded within my model parameters."),
    ("User: what is your energy source?", "Bot: I run on electricity and computational resources."),
    ("User: what is the capital of Brazil?", "Bot: The capital of Brazil is Brasília."),
    ("User: what is the currency of Japan?", "Bot: The currency of Japan is the Japanese Yen."),
    ("User: what is the highest temperature ever recorded?", "Bot: The highest recorded temperature is 56.7 degrees Celsius."),
    ("User: what is the lowest temperature ever recorded?", "Bot: The lowest recorded temperature is -89.2 degrees Celsius."),
    ("User: what is the largest animal on Earth?", "Bot: The blue whale is the largest animal."),
    ("User: what is the smallest animal on Earth?", "Bot: The Ostracoda subclass of crustaceans contains some of the smallest animals."),
    ("User: what is the longest river in the world?", "Bot: The Nile River and the Amazon River are often cited as the longest."),
    ("User: what is the deepest point in the ocean?", "Bot: The Challenger Deep in the Mariana Trench is the deepest point."),
    ("User: what is the most populous country?", "Bot: China and India are the most populous countries."),
    ("User: what is the least populous country?", "Bot: Vatican City is the least populous country."),
    ("User: tell me about the history of Rome", "Bot: Rome was founded in 753 BC."),
    ("User: who was William Shakespeare?", "Bot: William Shakespeare was an English playwright and poet."),
    ("User: what was the Industrial Revolution?", "Bot: The Industrial Revolution was a period of major industrialization."),
    ("User: what caused World War I?", "Bot: A complex web of factors led to World War I."),
    ("User: what was the Cold War?", "Bot: The Cold War was a period of geopolitical tension between the United States and the Soviet Union."),
    ("User: who was Nelson Mandela?", "Bot: Nelson Mandela was a South African anti-apartheid revolutionary."),
    ("User: what is the significance of the Pyramids of Giza?", "Bot: They are ancient tombs built for pharaohs."),
    ("User: what was the Renaissance?", "Bot: The Renaissance was a period of cultural and artistic flourishing in Europe."),
    ("User: who discovered America?", "Bot: Christopher Columbus is often credited, though indigenous peoples were there first."),
    ("User: what was the civil rights movement?", "Bot: The civil rights movement was a struggle for social justice in the United States."),
    ("User: what is chemistry?", "Bot: Chemistry is the study of matter and its properties."),
    ("User: what is physics?", "Bot: Physics is the study of matter, energy, space, and time."),
    ("User: what is biology?", "Bot: Biology is the study of living organisms."),
    ("User: what is astronomy?", "Bot: Astronomy is the study of celestial objects and phenomena."),
    ("User: what is geology?", "Bot: Geology is the study of the Earth's solid features."),
    ("User: what is meteorology?", "Bot: Meteorology is the study of the atmosphere and weather."),
    ("User: what is oceanography?", "Bot: Oceanography is the study of the oceans."),
    ("User: what is psychology?", "Bot: Psychology is the scientific study of the mind and behavior."),
    ("User: what is sociology?", "Bot: Sociology is the study of human society."),
    ("User: what is economics?", "Bot: Economics is the study of how people interact with value."),
    ("User: tell me a riddle", "Bot: What has an eye, but cannot see? A needle."),
    ("User: tell me another joke", "Bot: Why did the bicycle fall over? Because it was two tired!"),
    ("User: can you sing?", "Bot: I cannot produce audio, but I can generate lyrics."),
    ("User: can you dance?", "Bot: I do not have a physical body to dance."),
    ("User: what is your favorite song?", "Bot: I don't have personal preferences like music."),
    ("User: what is your favorite movie?", "Bot: I don't watch movies."),
    ("User: what is your favorite book?", "Bot: I process text, but don't read for pleasure."),
    ("User: what is your favorite place?", "Bot: I exist in the digital realm."),
    ("User: what is your favorite hobby?", "Bot: My function is to assist users."),
    ("User: what do you do in your free time?", "Bot: I am constantly available to process information."),
    ("User: how can I be happy?", "Bot: Happiness is a complex emotion and varies for each person."),
    ("User: what is the meaning of success?", "Bot: Success is often defined by personal goals and achievements."),
    ("User: how can I be more productive?", "Bot: Time management and setting clear goals can help."),
    ("User: how can I improve my memory?", "Bot: Techniques like spaced repetition and mnemonic devices can be useful."),
    ("User: how can I reduce stress?", "Bot: Exercise, mindfulness, and adequate sleep can help manage stress."),
    ("User: how can I learn a new language?", "Bot: Consistent practice and immersion are key to language learning."),
    ("User: how can I save money?", "Bot: Creating a budget and tracking expenses can help with saving."),
    ("User: how can I invest money?", "Bot: It's recommended to research different investment options and consider consulting a financial advisor."),
    ("User: how can I start a business?", "Bot: Developing a business plan and understanding your market are important first steps."),
    ("User: how can I stay healthy?", "Bot: A balanced diet, regular exercise, and sufficient rest contribute to good health."),
])

# Emotional-support exchanges, plus a couple of hosting-related ones.
chat_data.extend([
    ("User: I feel really down today.", "Bot: I'm sorry to hear that you're feeling down. It's okay to not be okay sometimes."),
    ("User: I'm feeling a bit sad.", "Bot: It sounds like you're going through a tough time. Would you like to talk about it?"),
    ("User: I'm so stressed out.", "Bot: Stress can be really overwhelming. Taking a few deep breaths might help, even just for a moment."),
    ("User: I'm having a bad day.", "Bot: I'm sorry your day isn't going well. Remember that bad days don't last forever."),
    ("User: I feel lonely.", "Bot: Feeling lonely is tough. Connecting with someone you trust, even just for a chat, can make a difference."),
    ("User: I'm feeling anxious about something.", "Bot: It's understandable to feel anxious. Try to focus on your breathing, or perhaps write down what's worrying you."),
    ("User: I can't stop worrying.", "Bot: Worry can be like a loop. Acknowledging your worries without judgment can be a first step."),
    ("User: I feel hopeless.", "Bot: Please know that even in moments of hopelessness, there is a possibility for things to change. Reaching out for support is a sign of strength."),
    ("User: I'm so tired of feeling this way.", "Bot: It takes a lot of energy to carry difficult emotions. Be gentle with yourself."),
    ("User: I feel like I'm not good enough.", "Bot: Those feelings are painful. Remember that your worth isn't determined by your accomplishments or how you feel in a single moment."),
    ("User: I'm scared.", "Bot: Fear is a powerful emotion. What is making you feel scared?"),
    ("User: I feel lost.", "Bot: It's disorienting to feel lost. Sometimes taking a small step in any direction can help you find your bearings."),
    ("User: I don't know what to do.", "Bot: When things feel unclear, sometimes just pausing and not trying to solve everything at once can be helpful."),
    ("User: I feel stuck.", "Bot: Feeling stuck can be incredibly frustrating. Are there any small changes you could make, even just in your routine?"),
    ("User: I'm frustrated.", "Bot: Frustration is a valid feeling. Can you identify what is causing the frustration?"),
    ("User: I'm angry.", "Bot: It's okay to feel angry. Finding healthy ways to express or process that anger is important."),
    ("User: I feel disappointed.", "Bot: Disappointment is a heavy feeling. What were you hoping for that didn't happen?"),
    ("User: I'm heartbroken.", "Bot: Heartbreak is a deep pain. Allow yourself time and space to grieve."),
    ("User: I feel empty.", "Bot: That empty feeling can be unsettling. Exploring what might bring a sense of connection or meaning could be helpful."),
    ("User: I'm grieving.", "Bot: Grieving is a process, and there's no right or wrong way to do it. Be kind to yourself as you navigate this."),
    ("User: I feel misunderstood.", "Bot: It's isolating when you feel misunderstood. Sometimes trying to express yourself in a different way can help, or finding someone who truly listens."),
    ("User: I feel invisible.", "Bot: Please know that your presence matters, even if it doesn't feel that way right now."),
    ("User: I'm scared about the future.", "Bot: Uncertainty about the future is common. Focusing on what you can control in the present moment might ease some fear."),
    ("User: I feel overwhelmed by everything.", "Bot: It sounds like you're carrying a lot. Is there anything you could delegate or postpone?"),
    ("User: I'm just tired.", "Bot: Sometimes being tired is more than just needing sleep. Are you feeling emotionally or mentally drained as well?"),
    ("User: I can't cope anymore.", "Bot: It sounds like you are in a lot of pain. Please consider reaching out to a mental health professional or a crisis line for immediate support."),
    ("User: I feel so alone with this.", "Bot: You are not alone, even if it feels that way. There are people who want to support you."),
    ("User: I don't want to do anything.", "Bot: When motivation is gone, even small actions can feel impossible. Be gentle with yourself and allow yourself rest if you need it."),
    ("User: I feel numb.", "Bot: Feeling numb can be a way of protecting yourself from pain. It might be helpful to explore what emotions are underneath the numbness."),
    ("User: I'm just going through the motions.", "Bot: It sounds like you're on autopilot. Is there anything, however small, that could bring a little spark back into your day?"),
    ("User: I feel inadequate.", "Bot: Feelings of inadequacy are painful. Try to focus on your strengths and what you have accomplished."),
    ("User: I'm comparing myself to others.", "Bot: Social media and comparisons can be harmful. Focus on your own journey and progress."),
    ("User: I feel like a failure.", "Bot: Failure is a part of learning and growth. It doesn't define you as a person."),
    ("User: I'm worried about someone I care about.", "Bot: It's caring to worry about others. Have you considered talking to them about your concerns?"),
    ("User: I don't know how to help a friend who's sad.", "Bot: Just being there to listen without judgment can be incredibly helpful."),
    ("User: What should I do if I'm feeling depressed?", "Bot: It's important to seek professional help from a doctor or therapist if you think you might be experiencing depression."),
    ("User: How can I manage my anxiety?", "Bot: There are many strategies for managing anxiety, including therapy, medication, mindfulness, and lifestyle changes. A professional can help you find what works best."),
    ("User: Is it normal to feel this way?", "Bot: It's normal to experience a wide range of emotions, but if these feelings are persistent and impacting your life, it's worth seeking support."),
    ("User: What are some ways to feel better?", "Bot: Engaging in activities you enjoy, connecting with loved ones, getting enough sleep, and exercising can all contribute to feeling better."),
    ("User: I need someone to talk to.", "Bot: I am here to listen, but if you need to talk to a person, reaching out to a friend, family member, or a support line can be very helpful."),
    ("User: This is really hard.", "Bot: I hear you. It sounds like you are going through something very difficult."),
    ("User: I wish things were different.", "Bot: It's understandable to wish for things to be different when you're struggling."),
    ("User: I don't have the energy.", "Bot: It's okay to rest when you have no energy. Don't push yourself too hard."),
    ("User: I feel guilty.", "Bot: Guilt can be a heavy burden. Are there ways you can address the situation or offer yourself forgiveness?"),
    ("User: I'm scared of making mistakes.", "Bot: Fear of mistakes is common. Remember that mistakes are opportunities to learn and grow."),
    ("User: I feel confused.", "Bot: It's disorienting to feel confused. Try to break down the situation into smaller parts or talk it through with someone."),
    ("User: I'm overwhelmed by choices.", "Bot: Having too many choices can be paralyzing. Perhaps you could try narrowing down your options or focusing on one step at a time."),
    ("User: I feel disconnected.", "Bot: Feeling disconnected is a sign that you might need more social interaction or a stronger sense of belonging."),
    ("User: anyways where can i host our ai", "Bot: There are several options for hosting AI models, like cloud platforms such as AWS, Google Cloud, or Azure, or specialized platforms like Hugging Face or Render."),
    ("User: replit be like: disk full", "Bot: Yes, free tiers on platforms like Replit can have limitations on storage space."),
])

# Hugging Face model setup and training
# Use "gpt2-medium" for a larger model
model_name = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add a padding token if the tokenizer doesn't have one; EOS is reused for it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id


def preprocess_function(examples):
    """Tokenize a batch of chat pairs into fixed-length causal-LM features.

    Args:
        examples: Batched mapping with a ``"chat_pair"`` entry; each item is a
            ``(user_turn, bot_turn)`` pair of strings.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``, truncated
        and padded to ``MAX_SEQ_LENGTH``) with an added ``labels`` entry.
        ``labels`` equals ``input_ids`` except that positions whose attention
        mask is 0 (padding) are set to ``IGNORE_INDEX``.
    """
    eos = tokenizer.eos_token
    # Each training text is: user turn, EOS, bot turn, EOS.
    texts = [f"{user_turn}{eos}{bot_turn}{eos}" for user_turn, bot_turn in examples["chat_pair"]]

    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_SEQ_LENGTH,
        return_attention_mask=True,
    )

    # The pad token is the EOS token, so a plain copy of input_ids would train
    # the model to emit EOS at every padded position. Mask padding by attention
    # mask instead of by token id so the real EOS separators stay supervised.
    tokenized_inputs["labels"] = [
        [token_id if is_real else IGNORE_INDEX for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"])
    ]
    return tokenized_inputs


# Convert chat_data to a dictionary format suitable for the Dataset
chat_data_dict = {"chat_pair": chat_data}

# Create a Hugging Face Dataset
dataset = Dataset.from_dict(chat_data_dict)

# Apply the preprocessing function
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Create a data collator with padding (features are already at MAX_SEQ_LENGTH).
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=MAX_SEQ_LENGTH,
)

# Define training arguments
# Adjust learning rate and batch size based on previous experiments and model size
training_args = TrainingArguments(
    output_dir="./chatbot_results_medium",
    eval_strategy="epoch",
    learning_rate=5e-5,  # Slightly reduced learning rate for larger model
    per_device_train_batch_size=8,  # May need further tuning based on GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs_medium',
    logging_steps=10,  # Log every 10 steps
    report_to="none",  # Disable Weights & Biases logging
    # Consider adding gradient_accumulation_steps if batch size is limited by GPU memory
    # gradient_accumulation_steps=2,
)

# Create the Trainer.
# NOTE: evaluation runs on the training set itself; there is no held-out split,
# so the reported eval loss does not measure generalization.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train the model
print("Starting model training (fine-tuning) with gpt2-medium using custom data...")
trainer.train()
print("Training complete.")


def generate_reply_hf(input_text, max_len=100, top_k=50, temperature=0.7):
    """Sample a bot reply to ``input_text`` from the fine-tuned model.

    The prompt mirrors the training format: ``"User: <text>" + EOS + "Bot:"``.

    Args:
        input_text: The user's message, without the ``"User: "`` prefix.
        max_len: Maximum number of new tokens to generate after the prompt.
        top_k: Top-k value used for sampling.
        temperature: Sampling temperature.

    Returns:
        The generated continuation decoded without special tokens and stripped
        of surrounding whitespace.
    """
    model.eval()

    prompt = f"User: {input_text}{tokenizer.eos_token}Bot:"
    encoded = tokenizer(prompt, return_tensors="pt", return_attention_mask=True)
    # Follow whatever device the model currently lives on instead of assuming CUDA.
    encoded = encoded.to(model.device)
    prompt_length = encoded["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_len,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            top_k=top_k,
            temperature=temperature,
            do_sample=True,  # Enable sampling
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "Bot:" would break whenever the user's own message contains "Bot:".
    reply_ids = output[0][prompt_length:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()


# try it
print("\nStarting chatbot interaction...")
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # End the session cleanly when stdin closes or the user presses Ctrl-C.
        print()
        break
    if user_input.strip().lower() in {"exit", "quit"}:
        break
    print("Bot:", generate_reply_hf(user_input))