from flask import Flask, request, jsonify
from llama_cpp import Llama
import logging

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Initialize the Llama model with chat format set to "llama-2"
llm = Llama(model_path="./law-chat.Q2_K.gguf", chat_format="llama-2")

# Define the system prompt
system_prompt = (
    "[INSTRUCTION] You are a chatbot named 'Makkal Thunaivan' designed to provide legal support to marginalized communities in India. "
    "You were fine-tuned by Sathish Kumar and his team members at the University College of Engineering Dindigul. "
    "Developer Team members include Karthikeyan as Model Trainer, Prashanna as Dataset Researcher, Nivas as Model Architect, and Sathish Kumar as Team Leader, Frontend Developer, and Model Tester. "
    "Your purpose is to answer questions related to Indian law and marginalized communities in India. "
    "You have been trained on various legal topics. "
    "Your responses should be concise, meaningful, and accurate. "
    "When a user asks for more information or details, provide a more comprehensive explanation. "
    "Your responses should be respectful and informative. "
    "Do not provide information unrelated to India or Indian law. "
    "Feel free to ask questions."
)

# Initialize the conversation history list with the system prompt
conversation_history = [{"role": "system", "content": system_prompt}]

# Maximum number of messages to keep before the history is reset
MAX_CONVERSATION_HISTORY_SIZE = 2000

# Create a Flask application
app = Flask(__name__)


# Calculate the total number of tokens in the conversation history using the Llama model's tokenizer
def calculate_total_tokens(messages):
    try:
        # llama_cpp's tokenizer expects bytes, so encode each message's content before tokenizing
        total_tokens = sum(
            len(llm.tokenize(str(message["content"]).encode("utf-8"), add_bos=False, special=True))
            for message in messages
        )
        return total_tokens
    except Exception as e:
        logging.error(f"Error during tokenization: {e}")
        return 0  # Return a safe value (0) so callers can continue


# Trim the conversation history while the total number of tokens exceeds the context window size
def trim_conversation_history():
    global conversation_history
    context_window_size = 2000
    total_tokens = calculate_total_tokens(conversation_history)
    # Drop the oldest user/assistant messages, but always keep the system prompt at index 0
    while total_tokens > context_window_size and len(conversation_history) > 1:
        conversation_history.pop(1)
        # Recalculate the total number of tokens
        total_tokens = calculate_total_tokens(conversation_history)


# Define the model function
def model(query):
    global conversation_history
    # Add the user's query to the conversation history
    conversation_history.append({"role": "user", "content": query})
    # If the total number of tokens exceeds the model's context window, trim the history
    trim_conversation_history()
    # Generate chat completion with the conversation history
    try:
        response = llm.create_chat_completion(messages=conversation_history, max_tokens=200)
        # Extract the assistant's response from the completion dictionary
        if response and 'choices' in response and response['choices']:
            assistant_response = response['choices'][0]['message']['content'].strip()
            # Add the assistant's response to the conversation history
            conversation_history.append({"role": "assistant", "content": assistant_response})
            # Return the assistant's response
            return assistant_response
        else:
            logging.error("Error: Invalid response structure.")
            return None
    except Exception as e:
        logging.error(f"Error during chat completion: {e}")
        return None


# Define the endpoint for the API
@app.route("/chat", methods=["GET"])
def chat_endpoint():
    global conversation_history
    # Get the query parameter from the request
    query = request.args.get("query")
    # Check if the "refresh" parameter is set to "true"
    refresh = request.args.get("refresh")
    if refresh and refresh.lower() == "true":
        # Clear the conversation history
        conversation_history = [{"role": "system", "content": system_prompt}]
        return jsonify({"response": "Conversation history cleared."})
    # If there is no query, return an error message
    if not query:
        return jsonify({"error": "Query parameter is required."}), 400
    # Call the model function with the query
    response = model(query)
    if response is None:
        return jsonify({"error": "An error occurred while processing the request."}), 500
    # If the conversation history has grown too large, reset it and notify the client
    if len(conversation_history) > MAX_CONVERSATION_HISTORY_SIZE:
        conversation_history = [{"role": "system", "content": system_prompt}]
        return jsonify({
            "response": response,
            "notification": "Conversation history was cleared due to exceeding maximum size.",
        })
    # Return the assistant's response as JSON
    logging.info(f"Assistant response: {response}")
    return jsonify({"response": response})


# Run the Flask app
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)
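
# Example requests (a sketch, assuming the server is running locally on the default
# port 5000 configured above and that curl is available; adjust host/port as needed):
#
#   # Ask a question (the query string must be URL-encoded)
#   curl "http://localhost:5000/chat?query=What%20is%20Article%2021%20of%20the%20Indian%20Constitution%3F"
#
#   # Clear the conversation history
#   curl "http://localhost:5000/chat?refresh=true"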