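"""Gradio dashboard for daily news topics.

Reads the daily_topic_info and daily_document_info feature groups from a
Hopsworks feature store, asks the OpenAI chat API for a one-word label for
each topic (falling back to the topic's highest-scoring keyword if the call
fails), and serves the most relevant articles plus a keyword word cloud.
"""
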
import gradio as gr
from wordcloud import WordCloud
import pandas as pd
import requests
import json
import hopsworks
import matplotlib.pyplot as plt
import os

MODEL = "gpt-3.5-turbo"
API_URL = "https://api.openai.com/v1/chat/completions"
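# The OpenAI key is expected in the environment as OPENAI_API_KEY.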
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


project = hopsworks.login(project="ServerlessIntroIris")
fs = project.get_feature_store()
df = fs.get_feature_group(name="daily_topic_info").read()
df2 = fs.get_feature_group(name="daily_document_info").read()
topics = df['topic'].unique()

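# Query the OpenAI chat completions endpoint; the reply is streamed back as
# server-sent events and accumulated into a single string.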
def gpt_predict(inputs, top_p=1.0, temperature=1.0, chat_counter=0, history=None):
    if history is None:
        history = []
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": f"{inputs}"}],
        "temperature": temperature,
        "top_p": top_p,
        "n": 1,
        "stream": True,
        "presence_penalty": 0,
        "frequency_penalty": 0,
    }

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}",
    }

    # Rebuild the running conversation: even history entries are user turns,
    # odd entries are assistant turns, then the new input is appended.
    if chat_counter != 0:
        messages = []
        for i, data in enumerate(history):
            role = 'user' if i % 2 == 0 else 'assistant'
            messages.append({"role": role, "content": data})
        messages.append({"role": "user", "content": f"{inputs}"})
        payload = {
            "model": MODEL,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "n": 1,
            "stream": True,
            "presence_penalty": 0,
            "frequency_penalty": 0,
        }

    chat_counter += 1

    history.append(inputs)
    token_counter = 0
    partial_words = ""
    counter = 0

    try:
        # Make a POST request to the API endpoint with stream=True so the
        # completion arrives incrementally as server-sent events.
        response = requests.post(API_URL, headers=headers, json=payload, stream=True)
        if response.status_code != 200:
            raise Exception(f"Sorry, hitting rate limit. Please try again later. {response}")
        for chunk in response.iter_lines():
            # Skip the first chunk, which carries no content delta.
            if counter == 0:
                counter += 1
                continue
            # Response data arrives as bytes; ignore empty keep-alive lines.
            chunk = chunk.decode()
            if chunk:
                # Strip the "data: " SSE prefix before parsing the JSON;
                # short lines such as "data: [DONE]" fail the length check.
                if len(chunk) > 12:
                    delta = json.loads(chunk[6:])['choices'][0]['delta']
                    if "content" in delta:
                        partial_words = partial_words + delta["content"]
                        if token_counter == 0:
                            history.append(" " + partial_words)
                        else:
                            history[-1] = partial_words
                        token_counter += 1
    except Exception as e:
        print(f'error found: {e}')
    return partial_words

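# Build one prompt containing every topic's keyword list so ChatGPT can
# propose one descriptive word per topic.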
readable_topics_dic = dict()
prompt = "I have lists of multiple words : "
mrk = []  # most representative keyword per topic (fallback if ChatGPT doesn't work)
for t in topics.tolist():
    if t != -1:
        selected_data = df[df['topic'] == t]
        keywords = selected_data['keywords'][selected_data.index[0]]
        freq = selected_data["scores"][selected_data.index[0]]
        # Keep the highest-scoring keyword as the fallback label.
        most_frequent_keyword = max(zip(keywords, freq), key=lambda x: x[1])
        mrk.append(most_frequent_keyword[0].capitalize())
        prompt += ", [" + ", ".join(keywords) + "]"


prompt += "  I want you to give me only one precise word that best describes the theme of this list. If I give you multiple lists, I want you to give me one word starting with a capital letter for each of those lists, and separate them by // (your answer should contain only one word for each; if I give you 100 lists, you give me 100 words)"
new_topics = gpt_predict(prompt)
nt = new_topics.split("//")

# Fallback in case ChatGPT is overloaded and returned fewer labels than
# topics: use each topic's most representative keyword instead.
i = 0
if len(nt) < len(topics.tolist()):
    nt = [f"Topic {o+1}: {mrk[o]}" for o in range(len(topics.tolist()) - 1)]
# Map each readable label back to its numeric topic id (topic -1 is skipped).
for t in topics.tolist():
    if t != -1:
        readable_topics_dic[nt[i]] = t
        if i < len(nt) - 1:
            i += 1



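# Build the dashboard view for one topic: a markdown list of the most
# relevant articles plus a word cloud of the topic's keywords.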
def display_topics(topic):
    topic = readable_topics_dic[topic]
    # Filter DataFrame based on the selected topic
    selected_data = df[df['topic'] == topic]
    selected_data2 = df2[df2['topic'] == topic]
    # Sort descending so the most probable documents come first.
    selected_data2 = selected_data2.sort_values(by='probability', ascending=False)
    # Display relevant articles
    articles = selected_data2['title'] 
    links = selected_data2['link']
    nb_art = min(4, len(links))
    articles_ret = "## Most relevant articles\n\n"
    for i in range(nb_art):
        ind = articles.index[i]
        articles_ret += f"* [{articles[ind]}]({links[ind]})\n"
    # Generate word cloud for keywords
    keywords = selected_data['keywords'][selected_data.index[0]]
    freq = selected_data["scores"][selected_data.index[0]]
    keywords_wordcloud = dict(zip(keywords, freq))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(keywords_wordcloud)
    fig, ax = plt.subplots()
    ax.axis("off")
    ax.imshow(wordcloud, interpolation='bilinear')

    return articles_ret, fig

# Define Gradio interface
iface = gr.Interface(
    fn=display_topics,
    inputs=gr.Dropdown(nt, label="Topic"),
    outputs=[gr.Markdown(label="Most relevant articles"), gr.Plot(label="Main Keywords")],
    live=True,
    examples=[]
)
 
# Launch the app
iface.launch()
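
# A minimal sketch of how this app might be launched locally, assuming the
# file is saved as app.py (the filename is an assumption) and credentials
# are available as environment variables (hopsworks.login can also prompt
# for an API key interactively):
#
#   export OPENAI_API_KEY=sk-...
#   export HOPSWORKS_API_KEY=...
#   python app.py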