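"""Gradio chat app for asking questions about an uploaded Jupyter notebook.

The notebook is converted to text and split into cells; the cells (and past
chat turns) most similar to the user's question are retrieved with sentence
embeddings and prepended to the prompt sent to Phi-3-mini via the Hugging
Face Inference API.
"""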
import requests
import os
import gradio as gr
import heapq

from utils import package_installer

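# package_installer (a helper in the local utils module) installs a package at
# startup so the imports below succeed.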
package_installer('sentence_transformers')
package_installer('nbconvert')
package_installer('inflect')
# nbformat and beautifulsoup4 are pulled in as dependencies of nbconvert:
# package_installer('nbformat')
# package_installer('beautifulsoup4')

from sentence_transformers import SentenceTransformer, util
import nbconvert
import nbformat
from bs4 import BeautifulSoup
from inflect import engine

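# inflect turns numbers into words and ordinals ("3" -> "three" / "3rd") for
# the cell labels built below.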
inflect_engine = engine()

def convert_ipynb_to_html(input_file):
    """Convert a .ipynb notebook file to an HTML string using nbconvert."""
    # Load the .ipynb file into a nbformat.NotebookNode object
    notebook = nbformat.read(input_file, as_version=4)

    # Render the notebook to HTML in memory (nothing is written to disk)
    html_exporter = nbconvert.HTMLExporter()
    (body, resources) = html_exporter.from_notebook_node(notebook)
    return body


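# The Hugging Face API token must be provided via the HF_TOKEN environment
# variable (e.g. as a Space secret).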
API_TOKEN = os.environ['HF_TOKEN']

API_URL = "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
headers = {"Authorization": f"Bearer {API_TOKEN}"}

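# Small sentence-embedding model used to rank notebook cells and past turns.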
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

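# Phi-3 instruct models use the <|user|> ... <|end|>\n<|assistant|> chat
# template; these helpers format messages accordingly.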
def user_prompt_template(user_msg: str):
    return f"<|user|>\n{user_msg}<|end|>\n<|assistant|>"

def assistant_response_template(assistant_msg: str):
    return f"{assistant_msg}<|end|>\n"


def query(payload):
    # POST the prompt to the hosted Inference API and return the parsed JSON.
    # A successful text-generation call typically returns a list like
    # [{"generated_text": ...}]; errors come back as a dict (handled where the
    # output is parsed in chat()).
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    return response.json()


def chat(message, history):
    """Gradio handler: retrieve relevant notebook cells and similar past
    turns, build a Phi-3 prompt, and query the Inference API."""
    formatted_user_msg = user_prompt_template(message['text'])

    # Drop file-upload entries, which Gradio stores as ((file_path,), None)
    # pairs, keeping only text turns.
    hist_cop = [item for item in history
                if not (None in item and isinstance(item[0], tuple))]

    # Parse the uploaded notebook (if any) into indexed text cells.
    indexed_cells_list = []
    index_comments = []
    try:
        html_code = convert_ipynb_to_html(message['files'][0])
        soup = BeautifulSoup(html_code, 'html.parser')
        text = soup.get_text()

        # Group consecutive non-empty lines into cells; a blank line closes a cell.
        code_data = text.split('\n')
        string = ''
        cells_list = []
        for item in code_data:
            if len(item) > 0:
                string += item + '\n'
                continue
            if len(item) == 0 and len(string) > 0:
                cells_list.append(string)
                string = ''

        # Drop nbconvert artifacts (the page title and empty "In [ ]:" prompts).
        cells_list_copy = [item for item in cells_list
                           if item not in ('Notebook\n', 'In\xa0[\xa0]:\n')]

        # Prefix each cell with several phrasings of its position ("# 3rd cell",
        # "# Cell Number: 3", "# Cell Number: three") so questions that refer to
        # a cell by number embed close to the right cell. inflect's ordinal()
        # also handles 11th/12th/13th correctly.
        for i, itxt in enumerate(cells_list_copy):
            cell_addresses = (f'# {inflect_engine.ordinal(i + 1)} cell\n'
                              f'# Cell Number: {i + 1}\n'
                              f'# Cell Number: {inflect_engine.number_to_words(i + 1)}\n')
            indexed_cells_list.append(cell_addresses + itxt)
            index_comments.append(cell_addresses)
    except Exception:
        # No file attached to this message (or it failed to parse); continue
        # with chat history only.
        pass

    emb_formatted_user_msg = embedding_model.encode(formatted_user_msg, convert_to_tensor=True)

    # Retrieve the (up to) five notebook cells most similar to the question,
    # kept in notebook order. Selecting by index avoids the ambiguity of
    # looking scores up with list.index() when scores are tied.
    top_5_cells = []
    if index_comments:
        emb_cells = embedding_model.encode(index_comments, convert_to_tensor=True)
        cell_scores = util.cos_sim(emb_formatted_user_msg, emb_cells)[0].tolist()
        top_indices = heapq.nlargest(5, range(len(cell_scores)), key=cell_scores.__getitem__)
        top_5_cells = [indexed_cells_list[i] for i in sorted(top_indices)]

    # Retrieve the (up to) two past chat turns most similar to the question.
    top_2_chats = None
    if hist_cop:
        chat_history = [user_prompt_template(item[0]) + assistant_response_template(item[1])
                        for item in hist_cop]
        emb_chat_history = embedding_model.encode(chat_history, convert_to_tensor=True)
        chat_scores = util.cos_sim(emb_formatted_user_msg, emb_chat_history)[0].tolist()
        top_indices = heapq.nlargest(2, range(len(chat_scores)), key=chat_scores.__getitem__)
        top_2_chats = [chat_history[i] for i in sorted(top_indices)]

    # Concatenate the retrieved turns into a prompt prefix.
    similar_chat_history = ''
    if top_2_chats:
        similar_chat_history = ''.join(top_2_chats)

    # Assemble the final prompt: similar past turns first, then the retrieved
    # cells and the new question in Phi-3 chat format.
    top_5_cells_string = '\n'.join(top_5_cells)
    context_plus_message = top_5_cells_string + message['text']
    formatted_context_plus_message = user_prompt_template(context_plus_message)
    user_input = similar_chat_history + formatted_context_plus_message

    # Near-zero temperature keeps answers focused and nearly deterministic.
    inp_dict = {"inputs": user_input,
                "parameters": {"max_new_tokens": 750, "temperature": 0.01}}
    output = query(inp_dict)
    try:
        output_text = output[0]['generated_text']
        # The API echoes the prompt; strip it and the trailing <|end|> token.
        formatted_assistant_msg = output_text.replace(user_input, '').strip().removesuffix('<|end|>')
    except (KeyError, IndexError, TypeError):
        # On failure the API typically returns a dict such as {"error": "..."}.
        if isinstance(output, dict):
            formatted_assistant_msg = f"An error has occurred; the API returned: {output}"
        else:
            formatted_assistant_msg = f"An error has occurred; the output has type {type(output)} and length {len(output)}"

    return formatted_assistant_msg

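# multimodal=True adds a file-upload box so a notebook can be attached to a
# message.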
demo = gr.ChatInterface(chat, multimodal=True)

if __name__ == '__main__':
    demo.launch()
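
# To run locally: `python app.py` (assuming this file is named app.py and
# utils.py sits alongside it), then open the printed local URL.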
