"""Survey bot API.

FastAPI service that (1) converts an uploaded survey spreadsheet into JSON
records and indexes them in a persistent Chroma collection, and (2) answers
survey questions for a free-text audience description using OpenAI models.
"""

import ast
import datetime
import json
import os
from pathlib import Path

import openai
import pandas as pd
import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Body, File, Form, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from langchain import PromptTemplate, LLMChain
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import JSONLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.vectorstores import Chroma
from langchain.vectorstores.chroma import Chroma

# NOTE: the helpers named in these (disabled) imports are defined in this file.
# from index_store.methods.common import query_jsons, remove_unwanted_string_json
# from index_store.methods.file_handler import save_file
# from index_store.methods.gpt import get_desc_llm, revalidate_final_response_gpt, survey_answer_llm
# from index_store.methods.indexing import data_cleanup, questions_semantic_search, store_to_index_db

# SECURITY: the OpenAI key must come from the process environment (or a local
# .env file read by load_dotenv), never from source code. A literal key used to
# be hard-coded here; treat that key as compromised and revoke it.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    print("WARNING: OPENAI_API_KEY is not set; OpenAI and embedding calls will fail")

os.environ["ROOT_FOLDER"] = os.getcwd()

_SEPARATOR = "--------------------------------------------------------------------"
_UPLOAD_FOLDER = 'index_store/surveys_storage_xls'
_DOC_STORAGE_DIR = 'index_store/doc_storage'

app = FastAPI()

origins = ['*']

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


def _collection_name(iso):
    """Return the Chroma collection name used for the given ISO code."""
    return iso.lower() + "-surveys" + "-v1"


def _safe_filename(filename):
    """Return only the final path component of a client-supplied file name.

    Raises:
        ValueError: If nothing usable remains (empty name, "." or "..").
    """
    # Normalise Windows separators first so "..\\..\\x" is reduced as well.
    name = os.path.basename((filename or "").replace("\\", "/"))
    if name in ("", ".", ".."):
        raise ValueError(f"Invalid file name: {filename!r}")
    return name


def _parse_keywords(raw):
    """Turn the keyword model's reply into a list of keyword strings.

    The model is prompted to answer with a Python-style list literal. When the
    reply is not a valid literal, fall back to splitting on commas.
    """
    try:
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError):
        parsed = [part.strip(" \t\r\n'\"[]") for part in raw.split(",")]
    # A bare string or scalar would otherwise be iterated character by character
    # (or not be iterable at all).
    items = parsed if isinstance(parsed, (list, tuple, set)) else [parsed]
    return [str(item) for item in items if str(item).strip()]


@app.get("/")
async def greet():
    """Return a static welcome message."""
    return {"message": "Welcome to the survey bot api"}


@app.post("/store")
async def index_file_vector_db(file: UploadFile, iso: str):
    """Save an uploaded survey spreadsheet, convert it to JSON and index it.

    Args:
        file: The uploaded Excel file.
        iso: ISO code used to derive the collection name.

    Returns:
        dict: A status message and the collection name the data was stored in.

    Raises:
        HTTPException: 400 when no file or an unusable file name is supplied.
    """
    print(_SEPARATOR)
    if file is None:
        raise HTTPException(status_code=400, detail="No file uploaded")
    try:
        safe_name = _safe_filename(file.filename)
    except ValueError as err:
        raise HTTPException(status_code=400, detail=str(err)) from err

    await save_file(_UPLOAD_FOLDER, file)
    file_path = os.path.join(os.environ["ROOT_FOLDER"], _UPLOAD_FOLDER, safe_name)
    print(file_path)
    json_data = data_cleanup(file_path)
    collection_name = store_to_index_db(json_data, iso)
    print("File updated successfully and stored in DB. collection_name :", collection_name)
    print(_SEPARATOR)
    return {"message": "File updated successfully and stored in DB", "collection_name": collection_name}


@app.post("/search")
async def get_survey_quesitons_answers(query: str, iso: str):
    """Answer the indexed survey questions that are relevant to a description.

    Args:
        query: Free-text description of the target group.
        iso: ISO code selecting which collection to search.

    Returns:
        str: The raw text produced by revalidate_final_response_gpt.

    Raises:
        HTTPException: 502 when the answering model does not return valid JSON.
    """
    print(_SEPARATOR)
    print("Query:", query)
    print("Iso:", iso)
    questions_data = questions_semantic_search(_collection_name(iso), query)
    questions_only = json.loads(questions_data["questions_without_answers"])
    questions_with_answers = questions_data["entire_questions_answeres"]

    questions_string = '\n'.join(item["Question"] for item in questions_only)
    context = get_desc_llm(query)
    try:
        answers = survey_answer_llm(context, questions_string)
        response_questions_json = json.loads(answers)
    except json.JSONDecodeError as err:
        raise HTTPException(
            status_code=502, detail="Language model returned malformed JSON"
        ) from err

    final_json = query_jsons(response_questions_json, questions_with_answers)
    response = revalidate_final_response_gpt(context, final_json)
    print(type(response))
    print("Processed Response------->", response)
    print(_SEPARATOR)
    return response


# methods for chatbot---------------------------------------------------------------
# Common methods

def query_jsons(answer_json: dict, whole_json: list) -> list:
    """Attach IDs and answer options to the questions the model answered.

    Args:
        answer_json: Mapping of question text to the model's answer.
        whole_json: Question records with 'ID', 'Question' and 'Answer_Options'.

    Returns:
        list: One {'ID', 'Question', 'Answer_Options'} dict per question in
        answer_json that also appears in whole_json, in answer_json order.
    """
    # Index once instead of scanning whole_json per answer; setdefault keeps the
    # first record for a duplicated question text, like the previous linear scan.
    by_question = {}
    for entry in whole_json:
        by_question.setdefault(entry['Question'], entry)

    results = []
    for question in answer_json:
        matching_entry = by_question.get(question)
        if matching_entry:
            results.append({
                'ID': matching_entry['ID'],
                'Question': question,
                'Answer_Options': matching_entry['Answer_Options'],
            })
    print(results)
    return results


def remove_unwanted_string_json(json_file: dict, uw_str: str) -> dict:
    """Return a copy of the catalog without questions answered with uw_str.

    Args:
        json_file: Dict with a 'catalogQuestions' list of dicts having 'answer'.
        uw_str: Answer value to filter out.

    Returns:
        dict: {'catalogQuestions': [...]} containing only the kept questions.
    """
    kept = [q for q in json_file['catalogQuestions'] if q['answer'] != uw_str]
    return {'catalogQuestions': kept}


# Data preprocessing methods

def _question_record(question_id, question, answers):
    """Build one output row from a question and its collected answer texts."""
    return {
        'ID': int(question_id),
        'Question': str(question).strip(),
        'Answer_Options': ', '.join(answers),
    }


def process_excel_data(excel_file):
    """Flatten a question/answer spreadsheet into one CSV row per question.

    A row with a non-empty 'Question ID' starts a new question; following rows
    without an ID contribute their 'Answers' cell to that question. The result
    is written to index_store/doc_storage/<excel base name>_output.csv.

    Args:
        excel_file (str): Path to the Excel file.

    Returns:
        bool: True once the CSV has been written.
    """
    base_name = os.path.splitext(os.path.basename(excel_file))[0]
    csv_file_name = f"{base_name}_output.csv"

    df = pd.read_excel(excel_file)

    # store_to_index_db writes its loader dump into ./doc_storage.
    os.makedirs("doc_storage", exist_ok=True)

    output_data = []
    current = None  # (question_id, question, [answer texts]) being collected
    for _, row in df.iterrows():
        if pd.notna(row['Question ID']):
            if current is not None:
                output_data.append(_question_record(*current))
            current = (row['Question ID'], row['Question'], [])
        elif current is not None and pd.notna(row['Answers']):
            # Answer rows before the first question have nothing to attach to.
            current[2].append(str(row['Answers']))
    # Flush the final question; it has no following question row to trigger it.
    if current is not None:
        output_data.append(_question_record(*current))

    # Explicit columns keep the CSV header present even when no rows were found.
    df_output = pd.DataFrame(output_data, columns=['ID', 'Question', 'Answer_Options'])
    os.makedirs(_DOC_STORAGE_DIR, exist_ok=True)
    df_output.to_csv(os.path.join(_DOC_STORAGE_DIR, csv_file_name), index=False)
    return True


def csv_to_json(csv_file_path):
    """Convert a CSV in index_store/doc_storage to a JSON records file.

    Args:
        csv_file_path (str): CSV file name, resolved relative to
            index_store/doc_storage.

    Returns:
        bool: True once the JSON file has been written.
    """
    json_file_name = os.path.splitext(os.path.basename(csv_file_path))[0] + '.json'
    json_file_path = os.path.join(_DOC_STORAGE_DIR, json_file_name)

    csv_frame = pd.read_csv(os.path.join(_DOC_STORAGE_DIR, csv_file_path))
    records = json.loads(csv_frame.to_json(orient='records'))

    # Three placeholder records are prepended to the data.
    # NOTE(review): the reason for these placeholders is not visible in this
    # file; they carry no 'ID'/'Question' keys, so readers must skip them.
    dummy_record = {
        'id': 'No_ID',
        'dummyQ': 'dummy_value2',
        'dummyA': 'dummy_value3',
    }
    json_data = [dummy_record, dummy_record, dummy_record] + records

    with open(json_file_path, 'w') as json_file:
        json.dump(json_data, json_file)
    return True


def get_question_data(df, question, answer_options_flag=True):
    """Retrieve the data of a specific question from a DataFrame.

    Args:
        df (pandas.DataFrame): Frame with 'ID', 'Question', 'Answer_Options'.
        question (str): The question text to look up.
        answer_options_flag (bool): Include 'answer_options' in the result
            when the question has any.

    Returns:
        dict or None: The question data, or None if the question is not found.
    """
    filtered_df = df[df['Question'] == question]
    if filtered_df.empty:
        return None

    answer_options = filtered_df['Answer_Options'].tolist()
    question_id = str(filtered_df['ID'].iloc[0])

    if not answer_options or all(pd.isna(options) for options in answer_options):
        # NOTE(review): this branch uses a lowercase 'id' key while the others
        # use 'ID'; kept as-is because callers outside this file may rely on it.
        return {'id': question_id, 'question': question}

    question_data = {'ID': question_id, 'question': question}
    if answer_options_flag:
        question_data['answer_options'] = answer_options
    return question_data


# File handler methods

async def save_file(folder, file):
    """Write an uploaded file into <ROOT_FOLDER>/<folder>.

    Args:
        folder: Destination folder, relative to the ROOT_FOLDER env variable.
        file: Upload object exposing .filename and an awaitable .read().

    Raises:
        ValueError: If the upload's file name has no usable final component.
    """
    folder_path = os.path.join(os.environ["ROOT_FOLDER"], folder)
    print("file name ----->", folder_path)
    os.makedirs(folder_path, exist_ok=True)

    # The file name is client-controlled: keep only its last component so it
    # cannot escape folder_path.
    file_path = os.path.join(folder_path, _safe_filename(file.filename))
    with open(file_path, "wb") as f:
        f.write(await file.read())


def rem_documents(file_path):
    """Delete a file, reporting (not raising) any failure."""
    try:
        os.remove(file_path)
        print("File deleted successfully:", file_path)
    except FileNotFoundError:
        print("File not found:", file_path)
    except Exception as e:
        print("An error occurred while deleting the file:", str(e))


# gpt methods

def get_keywords_chatgpt(query: str):
    """Ask the OpenAI chat model for keywords describing a sentence.

    Args:
        query (str): The input sentence or query.

    Returns:
        str: The model's reply text (prompted to be a list of keywords).
    """
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert in NLP, who can give keywords from a sentence by removing special characters. If the keyword is about a person, please include the gender."},
            {"role": "user", "content": "Women who are full-time employed and aged 50"},
            {"role": "assistant", "content": "['Female', 'Gender', 'Work', 'Occupation', 'Permanent','age']"},
            {"role": "user", "content": query + " \n please give keywords only, no additional text"},
        ],
    )
    print("get_keywords_chatgpt > Total tokens used:", completion["usage"]["total_tokens"])
    return completion["choices"][0]["message"]["content"]


def get_desc_llm(desc):
    """Ask the OpenAI chat model to spell out what a group description implies.

    Args:
        desc (str): The description of the group.

    Returns:
        str: The model's reply text.
    """
    check_desc = f'''
    Description: you are part of a group which can be described as '{desc}'
    Thought: What do i know about myself? Show detailed observations and Action.
    Thought: What i don't know about myself?
    Don't entire show verbose, don't mention anything additional
    '''
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        temperature=0.3,
        messages=[
            {"role": "system", "content": "You are an expert at finding the context fromt the description"},
            {"role": "user", "content": "You are part of a group which can be described as 'Married women with 2 children'"},
            {"role": "assistant", "content": '''i am a women hence my gender is female, i have 2 children not sure about their age and gender, my relationship status is married.
            I know about: my gender, children, relationship status only
            '''},
            {"role": "user", "content": check_desc},
        ],
    )
    print("get_desc_llm > Total tokens used:", completion["usage"]["total_tokens"])
    return completion["choices"][0]["message"]["content"]


def survey_answer_llm(json_data, desc: str):
    """Get the model's answers and drop every question answered "NaN".

    Both arguments are forwarded positionally to get_answers_llm, so json_data
    is used as the context and desc as the questions.

    Args:
        json_data: Context text passed to get_answers_llm as `cont`.
        desc: Questions passed to get_answers_llm as `q_json`.

    Returns:
        str: JSON object text (indent=2) of the remaining question/answer pairs.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    data = get_answers_llm(json_data, desc)
    data_dict = json.loads(data)
    print(data_dict)

    filtered_data = {key: value for key, value in data_dict.items() if value != "NaN"}
    filtered_data_json = json.dumps(filtered_data, indent=2)
    print(filtered_data_json)
    return filtered_data_json


def filter_questions_chatgpt(questions: dict, decs: str):
    """Ask the OpenAI chat model which questions the description already answers.

    Args:
        questions: The questions, interpolated verbatim into the prompt.
        decs (str): The description to check the questions against.

    Returns:
        str: The model's reply text.
    """
    prompt = f'''
    Questions:
    {questions}
    what are the 5 relevant quuestions in the above json for which we already know the answers from the description below
    Description:
    {decs}
    '''
    print(prompt)
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0.3,
        messages=[
            {"role": "system", "content": "You are an expert at selecting relavent questions from the list of qestions for which answers are available in description and returning in list format"},
            {"role": "user", "content": prompt},
        ],
    )
    print("filter_questions_chatgpt > Total tokens used:", completion["usage"]["total_tokens"])
    return completion["choices"][0]["message"]["content"]


def get_answers_llm(cont: str, q_json: dict):
    """Ask the OpenAI chat model to answer questions from a context.

    Args:
        cont: Context text the answers must be based on.
        q_json: The questions, interpolated verbatim into the prompt.

    Returns:
        str: The model's reply text (prompted to be JSON).
    """
    prompt = f'''
    You will be provided with a list of questions in JSON format and should respond to all questions based on the observation provided.
    If you are not sure about the answer, please respond with "NeC" in the JSON format for that question.
    If the answer is not available, please respond with "NaN" in the JSON format for that question.
    Context: {cont}
    Questions: {q_json}
    Before answering the questions, please make sure you have read the context and questions carefully.
    check if the answer is available in the context, if not, please remove that question from the JSON response.
    '''
    print(prompt)
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0.4,
        messages=[
            {"role": "system", "content": "You are an expert at answering questions based on the context, observation and action"},
            {"role": "user", "content": "what is your age?"},
            {"role": "assistant", "content": "50"},
            {"role": "user", "content": prompt},
        ],
    )
    print("get_answers_llm > Total tokens used:", completion["usage"]["total_tokens"])
    return completion["choices"][0]["message"]["content"]


def revalidate_final_response_gpt(context: str, json_response: dict) -> str:
    """Ask the completion model to pick final answers from the answer options.

    Args:
        context: Context text the answers must be based on.
        json_response: Questions with answer options, interpolated verbatim
            into the prompt.

    Returns:
        str: The raw completion text; it is not parsed or validated here.
    """
    prompt = f'''
    You will be provided with a list of questions in JSON format and should select right answer from the answer_options strictly based on the context provided.
    If the answer is not available, remove that question from the JSON response.
    If answer is similar to "No", "None", "None of the above","I don't" remove that question from the JSON response.
    Context:{context}
    Questions:{json_response}
    Response should be in below format only, don't add anything to the response other than below format:
    {{
        "catalogQuestions": [
            {{"id": "42", "question": "What is your gender?", "answer": "Female"}},
            {{"id": "632", "question": "What is your relationship status?", "answer": "Single, never married"}},
        ]
    }}
    Before answering the questions, please make sure you have read the context and questions carefully.
    check if the answer is available in the context, if not, please remove that question from the JSON response.
    '''
    print(prompt)
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.0,
        max_tokens=2000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion_text = response.choices[0].text
    print("revalidate_final_response_gpt > Total tokens used:", response["usage"]["total_tokens"])
    return completion_text


# indxing methods

def data_cleanup(file_path):
    """Convert an uploaded spreadsheet to CSV and then to JSON.

    Args:
        file_path: Path to the Excel file.

    Returns:
        str: Path of the generated JSON file under ./index_store/doc_storage/.
    """
    file_name = os.path.basename(file_path)
    print(file_name)
    # splitext (not split('.')) so the stem matches the one process_excel_data
    # uses, even when the file name contains more than one dot.
    stem = os.path.splitext(file_name)[0]
    process_excel_data(file_path)
    csv_to_json(stem + '_output.csv')
    return './index_store/doc_storage/' + stem + '_output.json'


def store_to_index_db(file_path, iso):
    """Load a JSON records file and store it in a persistent Chroma collection.

    Also writes a text dump of the loaded documents to
    ./doc_storage/<json stem>loader.txt.

    Args:
        file_path: Path to the JSON file (a top-level array of records).
        iso: ISO code used to derive the collection name.

    Returns:
        str: The name of the collection the documents were stored in.
    """
    os.makedirs('./index_store/survey_storage_db', exist_ok=True)
    collection_name = _collection_name(iso)

    loader = JSONLoader(file_path=file_path, jq_schema='.[]', text_content=False)
    documents = loader.load()

    # Ensure the dump folder exists rather than relying on an earlier call.
    os.makedirs('./doc_storage', exist_ok=True)
    stem = os.path.splitext(os.path.basename(file_path))[0]
    txt_file_path = './doc_storage/' + stem + 'loader.txt'
    with open(txt_file_path, 'w') as txt_file:
        for document in documents:
            txt_file.write(str(document))
            txt_file.write('\n')
            txt_file.write('----------------------------------------------------------')
            txt_file.write('\n')

    embeddings = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(
        documents,
        embeddings,
        collection_name=collection_name,
        persist_directory="index_store/survey_storage_db",
    )
    # Save DB locally
    vector_store.persist()
    return collection_name


def questions_semantic_search(collection_name, desc):
    """Find indexed questions related to the keywords of a description.

    Args:
        collection_name: Name of the Chroma collection to search.
        desc: Free-text description, e.g. "People who live in Scotland having
            a webcam and use Facebook or Tinder".

    Returns:
        dict: 'entire_questions_answeres' is a list of unique question dicts
        ('ID', 'Question', 'Answer_Options'); 'questions_without_answers' is a
        JSON string of the same questions with only 'ID' and 'Question'.
    """
    embedding = OpenAIEmbeddings()
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=embedding,
        persist_directory="./index_store/survey_storage_db",
    )

    keywords = _parse_keywords(get_keywords_chatgpt(desc))
    print("keywords---------------->", keywords)

    question_list = []
    question_ids = set()
    for keyword in keywords:
        query = f"questions related to keyword '{keyword}'"
        docs = vector_store.similarity_search(query, k=6)
        for doc in docs:
            question_json = json.loads(doc.page_content)
            # Skip anything that is not a question record, such as the
            # placeholder records csv_to_json inserts (they have no 'ID').
            if not isinstance(question_json, dict) or "ID" not in question_json:
                continue
            question_id = question_json["ID"]
            if question_id in question_ids:
                continue
            question_ids.add(question_id)
            question_list.append({
                "ID": question_id,
                "Question": question_json["Question"],
                "Answer_Options": question_json["Answer_Options"],
            })

    questions_without_answers = json.dumps(
        [{"ID": q["ID"], "Question": q["Question"]} for q in question_list]
    )
    return {
        "entire_questions_answeres": question_list,
        "questions_without_answers": questions_without_answers,
    }