import pandas as pd
import pymongo
from pyvi.ViTokenizer import tokenize

from src.services.generate_embedding import generate_embedding

SHEET_ID = "1MKB6MHgL_lrPB1I69fj2VcVrgmSAMLVNZR1EwSyTSeA"
SHEET_NAME = "Q&A"

# Connect DB
client = pymongo.MongoClient(
    "mongodb+srv://rag:p9vojYc9fafYwxE9@rag.xswi7nq.mongodb.net/?retryWrites=true&w=majority&appName=RAG"
)
db = client.rag
collection = db.questionAndAnswers


def insertQuestionAndAnswers(questionAndAnswers):
    # Insert the Q&A documents one by one so progress can be logged.
    for index, questionAndAnswer in enumerate(questionAndAnswers):
        print(f"inserting {index}: {questionAndAnswer['question']}")
        collection.insert_one(questionAndAnswer)


def deleteByUserId(user_id: str):
    # Remove every previously indexed document for this sheet/user.
    return collection.delete_many({"user_id": user_id})


def readDataFromGoogleSheet(sheet_id: str, sheet_name: str):
    # Export the sheet as CSV through the gviz endpoint and load it with pandas.
    url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
    df = pd.read_csv(url)
    items = []
    for index, row in df.iterrows():
        items.append(
            {
                "question": row["Question"],
                "answer": row["Answer"],
            }
        )
    # len(df) is the row count; df.size would count cells (rows * columns).
    print(f"read {len(df)} items from google sheet")
    return items


def indexData(sheet_id: str, sheet_name: str):
    items = readDataFromGoogleSheet(sheet_id, sheet_name)
    questionAndAnswers = []
    for item in items:
        # Word-segment the Vietnamese question before embedding it.
        tokenized_question = tokenize(item["question"])
        questionAndAnswer = {
            "question": tokenized_question,
            "answer": item["answer"],
            "question_embedding": generate_embedding(tokenized_question),
            "user_id": sheet_id,
        }
        questionAndAnswers.append(questionAndAnswer)

    # Re-index from scratch: drop the old documents for this sheet, then insert.
    deleteByUserId(sheet_id)
    print(f"deleted data of sheet {sheet_id}")
    insertQuestionAndAnswers(questionAndAnswers)


# Legacy article-indexing loop, kept for reference:
# for index, article in enumerate(data):
#     if index < 6580:
#         continue
#     if len(str(article['title'])) == 0 or len(str(article['description'])) == 0 or len(str(article['link'])) == 0:
#         continue
#     tokenized_title = tokenize(article['title'])
#     tokenized_description = tokenize(article['description'])
#     article = {
#         'title': tokenized_title,
#         'description': tokenized_description,
#         'link': article['link'],
#         # 'title_embedding': generate_embedding(tokenized_title),
#         'title_embedding': [],
#         'description_embedding': generate_embedding(tokenized_title + ": " + tokenized_description),
#     }
#     print(f"processed {index}/{len(articles)}")
#     save_db(article)
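
# --- Retrieval sketch (assumption) ---
# The documents store `question_embedding`, presumably so questions can be
# matched with MongoDB Atlas Vector Search. A minimal query sketch, assuming
# an Atlas vector index named "vector_index" exists on `question_embedding`
# with `user_id` declared as a filter field; the index name, its settings,
# and this helper are assumptions, not part of the original script.
def searchSimilarQuestions(query: str, user_id: str, limit: int = 5):
    # Tokenize and embed the query the same way the stored questions were.
    query_vector = generate_embedding(tokenize(query))
    return list(
        collection.aggregate(
            [
                {
                    "$vectorSearch": {
                        "index": "vector_index",  # assumed index name
                        "path": "question_embedding",
                        "queryVector": query_vector,
                        "numCandidates": 100,
                        "limit": limit,
                        "filter": {"user_id": {"$eq": user_id}},
                    }
                },
                {"$project": {"question": 1, "answer": 1, "_id": 0}},
            ]
        )
    )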
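
# --- Usage sketch ---
# A minimal entry point, assuming this file is meant to be run directly;
# the original script defines no entry point, so this guard is an addition.
if __name__ == "__main__":
    indexData(SHEET_ID, SHEET_NAME)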