|
import requests |
|
import os |
|
import gradio as gr |
|
import time |
|
import heapq |
|
import re |
|
from utils import context_identifier, query, convert_ipynb_to_html |
|
from sentence_transformers import SentenceTransformer, util |
|
import nbconvert |
|
import nbformat |
|
from bs4 import BeautifulSoup |
|
from inflect import engine |
|
|
|
|
|
def user_prompt_template(user_msg: str):
    """Wrap *user_msg* in Phi-3 chat-template tokens.

    The trailing ``<|assistant|>`` tag cues the model to continue
    with its own reply.
    """
    return "<|user|>\n" + user_msg + "<|end|>\n<|assistant|>"
|
|
|
|
|
def assistant_response_template(assistant_msg: str):
    """Terminate *assistant_msg* with the Phi-3 end-of-turn token."""
    return "".join((assistant_msg, "<|end|>\n"))
|
|
|
|
|
# Hugging Face Inference API configuration for the hosted Phi-3 model.
# NOTE(review): this raises KeyError at import time when HF_TOKEN is
# unset — presumably intentional in a Spaces deployment; confirm.
API_TOKEN = os.environ['HF_TOKEN']
API_URL = "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
headers = {"Authorization": f"Bearer {API_TOKEN}"}

# inflect engine spells numbers out in words (e.g. 3 -> "three") when
# labelling notebook cells.
inflect_engine = engine()
# Sentence-embedding model used to retrieve notebook cells and past chat
# turns by cosine similarity to the user's message.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
|
|
with gr.Blocks() as demo:

    def chat(message, history, file_path):
        """Answer `message` about the uploaded notebook, streaming the reply.

        Retrieval-augmented handler for gr.ChatInterface:
          1. Classify the query with `context_identifier`.
          2. Convert the uploaded .ipynb to HTML, strip it to plain text,
             and split it back into individual cells.
          3. Embed per-cell index comments and past chat turns; keep the
             5 most similar cells and 2 most similar past exchanges.
          4. Build a context-specific prompt, query the hosted Phi-3
             model, and yield the answer one character at a time to
             simulate streaming in the UI.

        Args:
            message: the user's current question.
            history: list of [user, assistant] pairs from gr.ChatInterface.
            file_path: path of the uploaded notebook (may be None).

        Yields:
            Progressively longer prefixes of the assistant's reply.
        """

        def top_indices(score_row, k):
            # Positions of the k largest scores, returned in ascending
            # (document) order. Ranking positions — not looking indices up
            # by score value — avoids the bug where duplicate scores all
            # resolve to the first matching index.
            scores = [float(s) for s in score_row]
            return sorted(heapq.nlargest(k, range(len(scores)),
                                         key=scores.__getitem__))

        context = context_identifier(message, API_TOKEN)
        print(context)
        hist_cop = history.copy()

        # Fallback prompt: the raw message. Overwritten below when notebook
        # context is available; also used when `context` carries an
        # unrecognized label (previously that left `prompt` unbound).
        prompt = message

        try:
            html_code = convert_ipynb_to_html(file_path)
            soup = BeautifulSoup(html_code, 'html.parser')
            text = soup.get_text()

            # The HTML export leaves an "In [ ]:" marker (with non-breaking
            # spaces) before every code cell; split on it to recover cells.
            code_data_raw = text.split('In\xa0[\xa0]:')
            code_data_raw2 = [cell.strip() for cell in code_data_raw
                              if cell.strip() != 'Notebook' and len(cell.strip()) > 0]

            # Some chunks still contain several cells separated by long
            # runs of blank lines; split those apart as well.
            code_data_cleaned = []
            new_line_sequence = "\n\n\n\n\n"
            for item in code_data_raw2:
                if new_line_sequence in item:
                    for piece in (part.strip() for part in item.split(new_line_sequence)
                                  if len(part) > 0):
                        if piece != 'Notebook':
                            code_data_cleaned.append(piece)
                else:
                    code_data_cleaned.append(item)

            # Label every cell with ordinal ("3rd cell"), numeric and
            # spelled-out index comments so positional queries ("what does
            # the third cell do?") embed close to the right cell.
            # BUG FIX: the original tested `i+1 % 10 == 1`, which parses as
            # `i + (1 % 10)`, so the st/nd/rd branches were effectively
            # dead; inflect's ordinal() also handles 11-13 correctly.
            indexed_cells_list = []
            index_comments = []
            for i, itxt in enumerate(code_data_cleaned):
                n = i + 1
                header = (f'# {inflect_engine.ordinal(n)} cell\n'
                          + f'# Cell Number: {n}\n'
                          + f'# Cell Number: {inflect_engine.number_to_words(n)}\n')
                indexed_cells_list.append(header + itxt)
                index_comments.append(header)

            # Retrieve the 5 cells whose labels best match the message.
            emb_cells = embedding_model.encode(index_comments, convert_to_tensor=True)
            emb_msg = embedding_model.encode(message, convert_to_tensor=True)
            cosine_sim_0 = util.cos_sim(emb_msg, emb_cells)
            top_5_cells = [indexed_cells_list[idx]
                           for idx in top_indices(cosine_sim_0[0], 5)]

            # Retrieve up to 2 past exchanges most similar to the message.
            top_2_chats = None
            if hist_cop:
                chat_history = [user_prompt_template(item[0])
                                + assistant_response_template(item[1])
                                for item in hist_cop]
                emb_chat_history = embedding_model.encode(chat_history,
                                                          convert_to_tensor=True)
                cosine_similarity_scores = util.cos_sim(emb_msg, emb_chat_history)
                top_2_chats = [chat_history[idx]
                               for idx in top_indices(cosine_similarity_scores[0], 2)]

            similar_chat_history = ''.join(top_2_chats) if top_2_chats else ''
            top_5_cells_string = '\n'.join(top_5_cells)

            if context == 'notebook_cell_context':
                prompt = f"""
You are a coding assistant who clarifies queries based on python. You will be given two types of context. One type of context
consists of previous chat messages and the other consists of code from a jupyter notebook. Your task is to answer/explanation user
query by picking relevant information from both context and coming up with the answer which explains the query. The user
query is delimited by ####.

previous_chat_context:
{similar_chat_history}

notebook_cell_context:
{top_5_cells_string}

####
{message}
####
"""
            elif context == 'previous_cell_context':
                prompt = f"""
You are a coding assistant who clarifies queries based on python. You will be given a context which consists of previous chat messages.
Your task is to answer/explain user query based on the context.The user query is delimited by ####.

previous_chat_context:
{similar_chat_history}

####
{message}
####
"""
            # Any other context label keeps the plain-message prompt.

        except Exception:
            # No/unreadable notebook or a retrieval failure: degrade to a
            # plain chat turn instead of crashing the UI.
            prompt = message

        user_input = user_prompt_template(prompt)
        inp_dict = {"inputs": user_input,
                    "parameters": {"max_new_tokens": 750, "temperature": 0.01}}
        output = query(url=API_URL, headers=headers, payload=inp_dict)

        try:
            output_text = output[0]['generated_text']
            # The API echoes the prompt; strip it plus the end-of-turn tag.
            formatted_assistant_msg = (output_text.replace(user_input, '')
                                       .strip().removesuffix('<|end|>'))
        except Exception:
            # Error payloads typically come back as a dict; surface the
            # details in the chat window for debugging.
            if isinstance(output, dict):
                formatted_assistant_msg = f"Error has occured, type of output is {type(output)} and items of output are: {output.items()}"
            else:
                formatted_assistant_msg = f"Error has occured, type of output is {type(output)} and length of output is: {len(output)}"

        # Simulated character-by-character streaming for the Gradio UI.
        streamed = ''
        for char in formatted_assistant_msg:
            streamed += char
            time.sleep(0.05)
            yield streamed

    # UI: file upload feeding the chat handler as an additional input.
    file = gr.File(interactive=True, container=False)
    chatbot = gr.ChatInterface(fn=chat, fill_height=False, additional_inputs=[file],
                               stop_btn='Stop Generation',
                               description="[Read the Instructions here!](https://huggingface.co/spaces/noorulamean444/clma_try/blob/main/README.md)")

demo.launch()
|
|