File size: 6,806 Bytes
b1aff23
 
 
 
 
ff1b0a1
fa7065e
b1aff23
ff1b0a1
 
 
4562763
 
45cbcf8
b1aff23
 
e4ec87f
c039f17
b1aff23
 
45cbcf8
 
c039f17
 
 
45cbcf8
c039f17
 
 
60cc907
7f5825e
 
 
 
 
 
 
5b84f6f
7f5825e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa7065e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f5825e
 
fa7065e
7f5825e
 
ddca142
7f5825e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa7065e
 
6c2642c
83cb40a
7f5825e
7ec22a8
7f5825e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import requests
import os
import gradio as gr
import time
import heapq
import re
from utils import context_identifier, query, convert_ipynb_to_html
from sentence_transformers import SentenceTransformer, util
import nbconvert
import nbformat
from bs4 import BeautifulSoup
from inflect import engine


def user_prompt_template(user_msg: str):
    """Wrap *user_msg* in the Phi-3 chat template for a user turn.

    The model expects `<|user|>` / `<|end|>` delimiters and a trailing
    `<|assistant|>` tag to cue the reply.
    """
    return "<|user|>\n" + user_msg + "<|end|>\n<|assistant|>"


def assistant_response_template(assistant_msg: str):
    """Terminate *assistant_msg* with the Phi-3 end-of-turn marker."""
    return "{}<|end|>\n".format(assistant_msg)


# --- Inference API configuration ---
# Requires HF_TOKEN in the environment; raises KeyError at import time if absent.
API_TOKEN = os.environ['HF_TOKEN']
# Serverless Inference API endpoint for Phi-3-mini (4k context, instruct-tuned).
API_URL = "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
headers = {"Authorization": f"Bearer {API_TOKEN}"}


# Used to spell out cell numbers in words (e.g. 3 -> "three") for richer matching.
inflect_engine = engine()
# Sentence embedding model for ranking notebook cells / past chats by similarity
# to the user's message. Downloaded on first use.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

with gr.Blocks() as demo:

    def _ordinal_suffix(n: int) -> str:
        """Return the English ordinal suffix ('st'/'nd'/'rd'/'th') for *n*.

        11, 12 and 13 are special-cased to 'th' (11th, 12th, 13th).
        """
        if 10 <= n % 100 <= 13:
            return 'th'
        return {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')

    def chat(message, history, file_path):
        """Answer *message* using the uploaded notebook and past chat as context.

        Parameters
        ----------
        message : str
            The user's current query.
        history : list
            Gradio chat history as (user, assistant) pairs.
        file_path : str
            Path to the uploaded .ipynb file (may be None/invalid; then the
            raw message is sent without notebook context).

        Yields
        ------
        str
            The assistant reply, streamed one character at a time.
        """
        context = context_identifier(message, API_TOKEN)
        print(context)
        hist_cop = history.copy()

        # Fallback prompt: used when notebook parsing fails OR when
        # `context` matches neither known label (the original code left
        # `prompt` unassigned in that case, causing a NameError).
        prompt = message

        try:
            # Render the notebook to HTML, then strip markup to plain text.
            html_code = convert_ipynb_to_html(file_path)
            soup = BeautifulSoup(html_code, 'html.parser')
            text = soup.get_text()

            # Notebook cells are delimited by the literal "In [ ]:" prompt
            # (with non-breaking spaces) in the rendered text.
            code_data_raw = text.split('In\xa0[\xa0]:')
            code_data_raw2 = [cell.strip() for cell in code_data_raw
                              if cell.strip() != 'Notebook' and len(cell.strip()) > 0]

            # A run of 5 newlines separates sub-sections that leaked into one
            # chunk; split those apart and drop the stray 'Notebook' title.
            code_data_cleaned = []
            new_line_sequence = "\n\n\n\n\n"
            for item in code_data_raw2:
                if new_line_sequence in item:
                    for piece in (part.strip() for part in item.split(new_line_sequence)):
                        if piece and piece != 'Notebook':
                            code_data_cleaned.append(piece)
                else:
                    code_data_cleaned.append(item)

            # Prefix each cell with several redundant "address" comments
            # (ordinal, digit, spelled-out) so the embedding model can match
            # queries like "third cell" or "cell number 3".
            indexed_cells_list = []
            index_comments = []
            for idx, cell_text in enumerate(code_data_cleaned, start=1):
                # BUG FIX: the original tested `i+1 % 10 == 1`, which parses as
                # `i + (1 % 10) == 1` and only ever matched the first cell;
                # it also produced "11st"/"12nd"/"13rd" for teens.
                header = (f'# {idx}{_ordinal_suffix(idx)} cell\n'
                          f'# Cell Number: {idx}\n'
                          f'# Cell Number: {inflect_engine.number_to_words(idx)}\n')
                indexed_cells_list.append(header + cell_text)
                index_comments.append(header)

            emb_cells = embedding_model.encode(index_comments, convert_to_tensor=True)
            emb_msg = embedding_model.encode(message, convert_to_tensor=True)
            cell_scores = util.cos_sim(emb_msg, emb_cells)[0].tolist()

            # Top-5 most similar cells, restored to notebook order.
            # (Index-based nlargest avoids the original O(n^2) `.index()`
            # lookups, which also mis-mapped duplicate scores.)
            top_5_idx = heapq.nlargest(5, range(len(cell_scores)),
                                       key=cell_scores.__getitem__)
            top_5_cells = [indexed_cells_list[i] for i in sorted(top_5_idx)]

            # Top-2 most similar previous exchanges, in chronological order.
            similar_chat_history = ''
            if hist_cop:
                chat_history = [user_prompt_template(user_msg) + assistant_response_template(assistant_msg)
                                for user_msg, assistant_msg in hist_cop]
                emb_chat_history = embedding_model.encode(chat_history, convert_to_tensor=True)
                hist_scores = util.cos_sim(emb_msg, emb_chat_history)[0].tolist()
                top_2_idx = heapq.nlargest(2, range(len(hist_scores)),
                                           key=hist_scores.__getitem__)
                similar_chat_history = ''.join(chat_history[i] for i in sorted(top_2_idx))

            top_5_cells_string = '\n'.join(top_5_cells)

            if context == 'notebook_cell_context':
                prompt = f"""
                You are a coding assistant who clarifies queries based on python. You will be given two types of context. One type of context
                consists of previous chat messages and the other consists of code from a jupyter notebook. Your task is to answer/explanation user 
                query by picking relevant information from both context and coming up with the answer which explains the query. The user
                query is delimited by ####.
    
                previous_chat_context: 
                {similar_chat_history} 
    
                notebook_cell_context:
                {top_5_cells_string}
    
                ####
                {message}
                ####
                """
            elif context == 'previous_cell_context':
                prompt = f"""
                You are a coding assistant who clarifies queries based on python. You will be given a context which consists of previous chat messages. 
                Your task is to answer/explain user query based on the context.The user query is delimited by ####.
    
                previous_chat_context: 
                {similar_chat_history} 
    
                ####
                {message}
                ####
                """

        except Exception:
            # Best-effort: any failure in notebook parsing / embedding falls
            # back to sending the bare message (narrowed from a bare `except:`
            # so KeyboardInterrupt/SystemExit are no longer swallowed).
            prompt = message

        user_input = user_prompt_template(prompt)

        inp_dict = {"inputs": user_input,
                    "parameters": {"max_new_tokens": 750, "temperature": 0.01}}
        output = query(url=API_URL, headers=headers, payload=inp_dict)

        try:
            # Successful responses look like [{"generated_text": "..."}].
            output_text = output[0]['generated_text']
            # The API echoes the prompt; strip it plus the end-of-turn marker.
            formatted_assistant_msg = output_text.replace(user_input, '').strip().removesuffix('<|end|>')
        except (KeyError, IndexError, TypeError):
            if isinstance(output, dict):
                formatted_assistant_msg = f"Error has occured, type of output is {type(output)} and items of output are: {output.items()}"
            else:
                formatted_assistant_msg = f"Error has occured, type of output is {type(output)} and length of output is: {len(output)}"

        # Stream the reply one character at a time for a typing effect.
        streamed = ''
        for char in formatted_assistant_msg:
            streamed += char
            time.sleep(0.05)
            yield streamed
    
    # Notebook upload widget; passed to chat() via additional_inputs.
    file = gr.File(interactive=True,container=False)
    # ChatInterface drives chat(message, history, file) and renders the
    # streamed generator output incrementally.
    chatbot = gr.ChatInterface(fn=chat,fill_height=False,additional_inputs=[file],stop_btn='Stop Generation',
                               description="[Read the Instructions here!](https://huggingface.co/spaces/noorulamean444/clma_try/blob/main/README.md)")


demo.launch()