import requests
import os
import gradio as gr
import time
import heapq
import re
from utils import context_identifier, query, convert_ipynb_to_html
from sentence_transformers import SentenceTransformer, util
import nbconvert
import nbformat
from bs4 import BeautifulSoup
from inflect import engine
def user_prompt_template(user_msg: str):
    """Wrap a raw user message in the Phi-3 chat template.

    Emits the ``<|user|> ... <|end|> <|assistant|>`` scaffold, leaving the
    assistant turn open so the model continues from there.
    """
    parts = ("<|user|>\n", user_msg, "<|end|>\n<|assistant|>")
    return "".join(parts)
def assistant_response_template(assistant_msg: str):
    """Terminate an assistant message with the Phi-3 end-of-turn marker."""
    return assistant_msg + "<|end|>\n"
# Hugging Face Inference API credentials and model endpoint.
API_TOKEN = os.environ['HF_TOKEN']  # raises KeyError if HF_TOKEN is not set
API_URL = "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
headers = {"Authorization": f"Bearer {API_TOKEN}"}
# inflect engine spells cell numbers out in words ("one", "two", ...) so a
# query like "explain the third cell" can match the cell index comments.
inflect_engine = engine()
# Sentence-embedding model used for similarity search over notebook cells
# and past chat turns.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Gradio app: everything defined inside this context belongs to the UI.
with gr.Blocks() as demo:
def chat(message, history, file_path):
    """Answer a user query about an uploaded Jupyter notebook (generator).

    The uploaded .ipynb is converted to HTML, split into cells, and each
    cell is prefixed with index comments ("# 3rd cell", "# Cell Number: 3",
    "# Cell Number: three"). The 5 cells and 2 past chat turns most similar
    to the query (cosine similarity of sentence embeddings) are packed into
    a prompt for the Phi-3 Inference API, and the reply is streamed back
    character by character.

    Args:
        message: Current user query string.
        history: Gradio chat history — list of (user, assistant) pairs.
        file_path: Path to the uploaded notebook; may be None/invalid, in
            which case the bare message is sent without context.

    Yields:
        Progressively longer prefixes of the assistant reply (typing effect).
    """

    def _ordinal_suffix(n: int) -> str:
        # 11, 12, 13 take 'th' despite ending in 1, 2, 3.
        if 10 <= n % 100 <= 20:
            return 'th'
        return {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')

    def _top_k_indices(score_row, k):
        # Indices of the k largest scores, in ascending index order.
        # BUG FIX: the original did `list(scores).index(score)` per top
        # score, which is O(n^2) and returns the same index for tied
        # scores (duplicating one cell and dropping another).
        scores = score_row.tolist()
        best = heapq.nlargest(k, enumerate(scores), key=lambda t: t[1])
        return sorted(idx for idx, _ in best)

    context = context_identifier(message, API_TOKEN)
    print(context)
    hist_cop = history.copy()
    try:
        html_code = convert_ipynb_to_html(file_path)
        soup = BeautifulSoup(html_code, 'html.parser')
        text = soup.get_text()
        # nbconvert renders empty input prompts as 'In\xa0[\xa0]:' (with
        # non-breaking spaces); splitting on that label separates cells.
        code_data_raw = text.split('In\xa0[\xa0]:')
        code_data_raw2 = [cell.strip() for cell in code_data_raw
                          if cell.strip() != 'Notebook' and len(cell.strip()) > 0]
        # Runs of 5 newlines mark cell output boundaries that survived the
        # split above; break those apart and drop stray 'Notebook' labels.
        code_data_cleaned = []
        new_line_sequence = "\n\n\n\n\n"
        for item in code_data_raw2:
            if new_line_sequence in item:
                split_items = [part.strip() for part in item.split(new_line_sequence) if len(part) > 0]
                code_data_cleaned.extend(part for part in split_items if part != 'Notebook')
            else:
                code_data_cleaned.append(item)

        # Prefix every cell with index comments in three spellings so the
        # embedding of the comments can match however the user refers to it.
        indexed_cells_list = []
        index_comments = []
        for i, cell_text in enumerate(code_data_cleaned):
            n = i + 1
            # BUG FIX: original tested `i+1 % 10 == 1`, which parses as
            # `i + (1 % 10)` — the ordinal suffix was effectively never
            # chosen correctly, and 11/12/13 would have gotten st/nd/rd.
            header = (f'# {n}{_ordinal_suffix(n)} cell\n'
                      + f'# Cell Number: {n}\n'
                      + f'# Cell Number: {inflect_engine.number_to_words(n)}\n')
            indexed_cells_list.append(header + cell_text)
            index_comments.append(header)

        emb_cells = embedding_model.encode(index_comments, convert_to_tensor=True)
        emb_msg = embedding_model.encode(message, convert_to_tensor=True)
        cosine_sim_0 = util.cos_sim(emb_msg, emb_cells)
        top_5_cells = [indexed_cells_list[i] for i in _top_k_indices(cosine_sim_0[0], 5)]

        # Retrieve the 2 past chat turns most similar to the query.
        top_2_chats = None
        if hist_cop:
            chat_history = [user_prompt_template(item[0]) + assistant_response_template(item[1])
                            for item in hist_cop]
            emb_chat_history = embedding_model.encode(chat_history, convert_to_tensor=True)
            cosine_similarity_scores = util.cos_sim(emb_msg, emb_chat_history)
            top_2_chats = [chat_history[i] for i in _top_k_indices(cosine_similarity_scores[0], 2)]

        similar_chat_history = ''.join(top_2_chats) if top_2_chats else ''
        top_5_cells_string = '\n'.join(top_5_cells)

        if context == 'notebook_cell_context':
            prompt = f"""
You are a coding assistant who clarifies queries based on python. You will be given two types of context. One type of context
consists of previous chat messages and the other consists of code from a jupyter notebook. Your task is to answer/explanation user
query by picking relevant information from both context and coming up with the answer which explains the query. The user
query is delimited by ####.
previous_chat_context:
{similar_chat_history}
notebook_cell_context:
{top_5_cells_string}
####
{message}
####
"""
        elif context == 'previous_cell_context':
            prompt = f"""
You are a coding assistant who clarifies queries based on python. You will be given a context which consists of previous chat messages.
Your task is to answer/explain user query based on the context.The user query is delimited by ####.
previous_chat_context:
{similar_chat_history}
####
{message}
####
"""
        else:
            # BUG FIX: the original left `prompt` undefined for any other
            # context label, raising NameError further down.
            prompt = message
    except Exception:
        # Best-effort: no/invalid notebook (or any parsing failure) falls
        # back to sending the bare message without retrieved context.
        prompt = message

    user_input = user_prompt_template(prompt)
    inp_dict = {"inputs": user_input,
                "parameters": {"max_new_tokens": 750, "temperature": 0.01}}
    output = query(url=API_URL, headers=headers, payload=inp_dict)
    try:
        output_text = output[0]['generated_text']
        # The API echoes the prompt; strip it and the end-of-turn marker.
        formatted_assistant_msg = output_text.replace(user_input, '').strip().removesuffix('<|end|>')
    except (KeyError, IndexError, TypeError):
        # API returned an error payload instead of a generation list.
        if isinstance(output, dict):
            formatted_assistant_msg = f"Error has occured, type of output is {type(output)} and items of output are: {output.items()}"
        else:
            formatted_assistant_msg = f"Error has occured, type of output is {type(output)} and length of output is: {len(output)}"

    # Stream the reply one character at a time for a typing effect.
    # BUG FIX: use a fresh accumulator instead of clobbering `output`.
    streamed = ''
    for char in formatted_assistant_msg:
        streamed += char
        time.sleep(0.05)
        yield streamed
# File-upload widget; passed to `chat` as its `file_path` additional input.
file = gr.File(interactive=True,container=False)
# Chat UI bound to the streaming `chat` generator defined above.
chatbot = gr.ChatInterface(fn=chat,fill_height=False,additional_inputs=[file],stop_btn='Stop Generation',
description="[Read the Instructions here!](https://huggingface.co/spaces/noorulamean444/clma_try/blob/main/README.md)")
# Start the Gradio server (blocking call).
demo.launch()