"""Insurance Q&A assistant: crawls Naver Q&A, queries Perplexity, and summarizes
the combined context with Gemini, served as a streaming FastAPI endpoint."""

from openai import OpenAI
import google.generativeai as genai
from crawler import extract_data
import time
import os
from dotenv import load_dotenv
import gradio as gr

# from together import Together
# from transformers import AutoModel, AutoTokenizer
# from sklearn.metrics.pairwise import cosine_similarity
# import torch
#
# load_dotenv("../.env")
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# together_client = Together(
#     api_key=os.getenv("TOGETHER_API_KEY"),
# )

# Load API keys from a local .env file when one is present (no-op otherwise).
load_dotenv()

genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
gemini_query = genai.GenerativeModel('gemini-2.0-flash-exp')
gemini_summarizer = genai.GenerativeModel('gemini-1.5-flash')

perplexity_client = OpenAI(api_key=os.getenv("PERPLEXITY_API_KEY"),
                           base_url="https://api.perplexity.ai")
# gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# with torch.no_grad():
#     model = AutoModel.from_pretrained('BM-K/KoSimCSE-roberta')
#     tokenizer = AutoTokenizer.from_pretrained('BM-K/KoSimCSE-roberta')

# def cal_score(input_data):
#     similarity_scores = []
#     # Initialize model and tokenizer inside the function
#     with torch.no_grad():
#         inputs = tokenizer(input_data, padding=True, truncation=True, return_tensors="pt")
#         outputs = model.get_input_embeddings()(inputs["input_ids"])
#     for ind in range(1, outputs.size(0)):
#         a, b = outputs[0], outputs[ind]
#         a = a.reshape(1, -1)
#         b = b.reshape(1, -1)
#         a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
#         b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
#         similarity_scores.append(cosine_similarity(a_norm, b_norm))  # scalar value
#     return similarity_scores


def get_answers(query: str):
    # Crawl one page of Naver Q&A results for the query.
    context = extract_data(query, 1)
    # if len(context) > 1:
    #     scores = cal_score([query] + [answer['questionDetails'] for answer in context])
    #     context = [context for _, context in sorted(zip(scores, context), key=lambda x: x[0], reverse=True)]
    #     mean_score = sum(scores) / len(scores)
    #     context = [ctx for score, ctx in zip(scores, context) if score >= mean_score]
    return context


def get_gemini_query(message: str):
    print(">>> Starting gemini query generation...")
    response = gemini_query.generate_content(message)
    print("Finished gemini query generation: ", response.text)
    return response.text


def get_naver_answers(message: str):
    print(">>> Starting naver extraction...")
    print("Question: ", message)
    # Long questions are first condensed by Gemini into a short Korean title
    # so they can be used as a search query.
    if len(message) > 300:
        message = get_gemini_query(f"{message}\n 위의 내용을 짧은 제목으로 요약합니다. 제목만 보여주세요. 대답하지 마세요. 한국어로만 답변해주세요!!!")
    print("Query: ", message)
    context = get_answers(message)
    # Format each crawled item as "질문: ... / 답변: ..." (question / answer).
    sorted_answers = [
        f"{index}. 질문: {answer['questionDetails']}" + '\n'
        + f" 답변: {'. '.join(answer['answers'])} " + '\n'
        for (index, answer) in enumerate(context)
    ]
    document = '\n'.join(sorted_answers)
    return document


def get_perplexity_answer(message: str):
    print(">>> Starting perplexity extraction...")
    messages = [
        {
            "role": "system",
            "content": (
                "You are an artificial intelligence assistant and you need to "
                "engage in a helpful, CONCISE, polite question-answer conversation with a user."
            ),
        },
        {
            "role": "user",
            "content": message,
        },
    ]
    response = perplexity_client.chat.completions.create(
        model="llama-3.1-sonar-small-128k-online",
        messages=messages,
    )
    return response.choices[0].message.content


def chatFunction(history):
    # MAX_TOKEN_LIMIT = 58000
    start_time = time.time()
    message = history[-1][0]
    # content = f' 질문과 답변으로 구성된 문서를 드리겠습니다. \
    #     아래에 제공된 질문에 답하기 위해 중요한 정보를 추출하세요. \
    #     한국어로만 답변하세요. 구체적이지만 간결하게 작성하세요. \
    #     실제 보험상담사가 답변을 하듯이 친절한 답변을 해 주세요. \
    #     \n 질문: {message}\n 문서: '

    # Korean instruction prompt: answer as the insurance agent in the first person,
    # replacing third-party mentions (e.g. "KB손해보험 설계사 OOO입니다" -> "보험기관입니다"),
    # so the questioner trusts the agent and feels free to ask follow-up questions.
    content = f' 보험설계사가 답을 줘서, 더 많은 질문이나 합당한 보험에 가입할 수 있도록 답변을 하려고 합니다. \
        문서에 있는 제3자 언급을 1인칭으로 바꾸세요. 예를 들어 "KB손해보험 설계사 OOO입니다" 등 제3자가 언급된 경우 "보험기관입니다"로 대체합니다. \
        이러한 답변을 통해서 질문자가 이 답변을 보고 보험설계사에게 더 신뢰를 갖고 추가 질문이 있으면 물어볼 수 있도록 하려고 합니다. \
        실제 보험상담사가 답변을 하듯이 친절한 답변을 해 주세요. \n 질문: {message}\n 문서: '

    ### Extracting from Naver ###
    naver_docs = get_naver_answers(message)
    print(len(naver_docs))
    # if len(naver_docs) > MAX_TOKEN_LIMIT:
    #     print("HERE")
    #     start_tmp = time.time()
    #     overlap = 200
    #     answers = []
    #     split_len = len(naver_docs) // ((len(naver_docs) - MAX_TOKEN_LIMIT) // MAX_TOKEN_LIMIT + 2) + 1
    #     print(len(naver_docs) // split_len)
    #     for i in range(len(naver_docs) // split_len):
    #         print("HERE: ", i)
    #         if i == 0:
    #             split = naver_docs[:split_len]
    #         else:
    #             split = naver_docs[i * split_len - overlap: (i + 1) * split_len]
    #         answer, _ = get_qwen_small_answer(f"Summarize important points in a paragraph, given the information below, using only Korean language. Give me only the summary!!! \n {split}")
    #         answers.append(answer)
    #     print("Answers: ", answers)
    #     naver_docs = '\n'.join(answers)
    #     naver_time_taken += time.time() - start_tmp
    #     print("Post chunking length: ", len(naver_docs))
    content += "\n Naver 문서: " + naver_docs

    ### Extracting from Perplexity ###
    perplexity_resp = get_perplexity_answer(message)
    content += "\n Perplexity 문서: " + perplexity_resp

    print(">>> Starting Gemini summarization...")
    response = gemini_summarizer.generate_content(content, stream=True)
    history[-1][1] = ''
    ans = ""
    for chunk in response:
        ans += chunk.text.replace("*", "")
        # Yield the cumulative answer so far (Gradio-style streaming updates).
        yield ans.strip() + "\n"
        time.sleep(0.05)
    print("Finished Gemini summarization")
    print("Time taken: ", time.time() - start_time)


def set_user_response(message: str, history: list):
    # Gradio-style helper: append the user turn and clear the input textbox.
    history.append([message, None])
    return '', history


### Server-side code ###
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/")
async def root():
    return {"message": "Hello World"}


class Message(BaseModel):
    message: str


@app.post("/chat")
async def chat(message: Message):
    history = [[message.message, None]]
    return StreamingResponse(chatFunction(history), media_type='text/event-stream')
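

# A minimal sketch for running the API locally, assuming uvicorn is installed
# alongside FastAPI; the host and port values are placeholder choices.
# Hypothetical request once the server is up:
#   curl -N -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "실손보험 청구는 어떻게 하나요?"}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)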