Adr740 committed on
Commit
f798440
·
verified ·
1 Parent(s): 8c257fb

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +219 -0
  2. evaluator.py +57 -0
  3. hogwarts.py +107 -0
  4. questions.py +40 -0
app.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from functools import partial
4
+ from config import file_id_htl_biotech, file_id_kamera_express, file_id_smart_sd, file_id_sunday_naturals
5
+ import gdown
6
+ import pandas as pd
7
+ from hogwarts import get_answer
8
+ from evaluator import eval_answer
9
+
10
+ dico = {file_id_htl_biotech : {"name" : "htl-biotechnology", "data" : None},
11
+ file_id_smart_sd : {"name" : "smart-sd", "data" : None},
12
+ file_id_kamera_express : {"name" : "kamera-express", "data" : None},
13
+ file_id_sunday_naturals : {"name" : "sunday-naturals", "data" : None}, }
14
+
15
+ choices = ["htl-biotechnology",
16
+ "smart-sd",
17
+ "kamera-express",
18
+ "sunday-naturals"]
19
+
20
+ title = "AI4PE - Olivier and Adam \n contact: adamrida.ra@gmail.com or sp.olivier@hotmail.com"
21
+
22
+
23
+ for file_id in dico:
24
+ print("GOING FOR ", dico[file_id]["name"])
25
+ download_url = f'https://drive.google.com/uc?id={file_id}'
26
+
27
+ # Download the file using gdown
28
+ output = 'downloaded_file.csv'
29
+ gdown.download(download_url, output, quiet=False)
30
+
31
+ # Read the CSV file into a DataFrame
32
+ df = pd.read_csv(output, sep=";")[["content", "embeddings"]].replace("transcript_", "expert_meeting_notes_")
33
+ dico[file_id]["data"] = df
34
+
35
+ id_to_name_mapper = {
36
+ file_id_htl_biotech : 'htl-biotechnology',
37
+ file_id_smart_sd : 'smart-sd',
38
+ file_id_kamera_express : 'kamera-express',
39
+ file_id_sunday_naturals : 'sunday-naturals',
40
+ }
41
+ name_to_id_mapper = {
42
+ 'htl-biotechnology': file_id_htl_biotech,
43
+ 'smart-sd': file_id_smart_sd,
44
+ 'kamera-express': file_id_kamera_express,
45
+ 'sunday-naturals': file_id_sunday_naturals,
46
+ }
47
+
48
def get_list_files(company, dico=dico, name_to_id_mapper=name_to_id_mapper):
    """Summarize the sources present in *company*'s data room.

    Returns three markdown strings: (web pages, uploaded PDFs, expert-call notes).
    """
    pdf_sources = []
    web_sources = []
    note_sources = []
    for record in dico[name_to_id_mapper[company]]["data"].content.values:
        header = record.split("\n")[0]

        if "SOURCE: COMPANY WEBSITE" in record:
            # Turn the stored URL-ish filename into something readable.
            cleaned = (header
                       .replace("https::", "")
                       .replace("https:", "")
                       .replace(".txt", "")
                       .replace(".com", " ")
                       .replace(".", " Page: "))
            web_sources.append(cleaned)
        if "SOURCE: PDF FILE" in record:
            # Recover the original pdf name from the stored PATH_FILE metadata.
            pdf_name = record.split("PATH_FILE =")[1].split("'}\"")[0].split("/pdfs/")[1].split("/png")[0]
            pdf_sources.append("SOURCE: UPLOADED PDF - " + pdf_name + ".pdf")
        if "SOURCE: NOTES FROM EXPERT CALL" in record:
            # Rename raw transcript filenames to human-friendly "Note #N" labels.
            note_name = (record
                         .replace("_1 copy", "")
                         .replace("transcript ", "Note #")
                         .replace("transcript_1", "Note #2")
                         .replace("transcript", "Note #1")
                         .replace(".txt", "")
                         .split("\n")[0])
            note_sources.append(note_name)

    # set() drops duplicates before rendering each section.
    pdfs_string = "## Uploaded PDF files: \n" + "\n\n".join(set(pdf_sources))
    web_pages = "## Enriched from the web: \n" + "\n\n".join(set(web_sources))
    transcript = "## Uploaded notes from expert calls: \n" + "\n\n".join(set(note_sources))
    return web_pages, pdfs_string, transcript
77
def get_data_room_overview(company, dico=dico, name_to_id_mapper=name_to_id_mapper):
    """Build the data-room overview markdown plus the per-company source lists.

    Returns 13 values in the exact order the Gradio outputs list expects:
    overview, then (web, pdfs, expert) for sunday-naturals, smart-sd,
    htl-biotechnology and kamera-express.
    """
    contents = dico[name_to_id_mapper[company]]["data"].content.values
    nb_web = sum("SOURCE: COMPANY WEBSITE" in c for c in contents)
    nb_pdfs = sum("SOURCE: PDF FILE" in c for c in contents)
    nb_expert_transcripts = sum("SOURCE: NOTES FROM EXPERT CALL" in c for c in contents)
    disp = f"""---
### Overview of the data room
Enriched data room with: Linkedin profile and company website

Volumetry:
- {nb_pdfs} passages from PDF files
- {nb_web} passages from company website
- {nb_expert_transcripts} passages from notes of expert calls
"""

    # Refresh every company's source lists (all tabs are updated at once).
    per_company = {
        name: get_list_files(name, dico, name_to_id_mapper)
        for name in ("sunday-naturals", "smart-sd", "htl-biotechnology", "kamera-express")
    }

    return (disp,
            *per_company["sunday-naturals"],
            *per_company["smart-sd"],
            *per_company["htl-biotechnology"],
            *per_company["kamera-express"])
106
def generate_chat_answer(company_name, query):
    """Run retrieval-augmented answering for *company_name* and score the result.

    Returns (response_markdown, evaluation_markdown) for the Gradio UI.
    """
    import json

    df = dico[name_to_id_mapper[company_name]]["data"]
    response = get_answer(df, 15, query)
    print("=====> Evaluating answer quality...")
    # eval_answer returns a JSON string (the evaluator requests
    # response_format={"type": "json_object"}), so parse it with json.loads.
    # The previous eval() executed arbitrary model output as Python code.
    eval_score = json.loads(eval_answer(query, response))
    # Typos fixed in the user-facing text ("Evalation"/"intial").
    eval_md = f"""
### Evaluation of how well the response answers the initial question

Score of **{eval_score["score"]}/5**

Rationale:

{eval_score["rationale_based_on_scoring_rules"]}
"""

    return response, eval_md
124
def _source_tab(tab_label):
    """Build one "Data" sub-tab with three markdown panes (web / pdfs / notes)."""
    with gr.Tab(tab_label):
        with gr.Row():
            with gr.Column():
                web = gr.Markdown("Sources obtained from website")
            with gr.Column():
                pdfs = gr.Markdown("Sources obtained from uploaded pdfs")
                expert = gr.Markdown("Sources obtained from expert call notes")
    return web, pdfs, expert


with gr.Blocks(title=title, theme='nota-ai/theme') as demo:
    gr.Markdown(f"## {title}")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            company_name = gr.Dropdown(choices=choices, label="Select company")
            submit_button = gr.Button(value="Load workspace")
            data_room_overview = gr.Markdown("---\n### Overview of the data room")
        with gr.Column(scale=6):
            with gr.Tab("Chat - Baseline"):
                with gr.Row():
                    with gr.Column(scale=5):
                        chat_input = gr.Textbox(placeholder="Chat input", lines=2, label="Retrieve anything from the dataroom")
                    with gr.Column(scale=1):
                        chat_submit_button = gr.Button(value="Submit")
                with gr.Accordion("Accuracy score", open=False):
                    evaluator = gr.Markdown("Waiting for answer to evaluate...")
                chat_output = gr.Markdown("Waiting for question...")
            with gr.Tab("Chat - ICL", interactive=False):
                with gr.Column(scale=5):
                    chat_input_gemini = gr.Textbox(placeholder="Chat input", lines=2, label="Retrieve anything from the dataroom")
                with gr.Column(scale=1):
                    chat_submit_button_gemini = gr.Button(value="Submit")
                chat_output_gemini = gr.Markdown("Waiting for question...")
            with gr.Tab("Data", interactive=True):
                # One identical sub-tab per company, built by the shared helper.
                sunday_naturals_web, sunday_naturals_pdfs, sunday_naturals_expert = _source_tab("Sunday Naturals")
                smart_sd_web, smart_sd_pdfs, smart_sd_expert = _source_tab("Smart SD")
                htl_biotech_web, htl_biotech_pdfs, htl_biotech_expert = _source_tab("HTL Biotech")
                kamera_express_web, kamera_express_pdfs, kamera_express_expert = _source_tab("Kamera Express")
            with gr.Tab("Benchmark", interactive=False):
                pass

    fn = partial(get_data_room_overview)
    fn_chat = partial(generate_chat_answer)
    # "Load workspace": one overview pane plus three source panes per company,
    # in the exact order get_data_room_overview returns them.
    submit_button.click(fn=fn, inputs=[company_name], outputs=[
        data_room_overview,
        sunday_naturals_web, sunday_naturals_pdfs, sunday_naturals_expert,
        smart_sd_web, smart_sd_pdfs, smart_sd_expert,
        htl_biotech_web, htl_biotech_pdfs, htl_biotech_expert,
        kamera_express_web, kamera_express_pdfs, kamera_express_expert,
    ])

    chat_submit_button.click(fn=fn_chat, inputs=[company_name, chat_input], outputs=[chat_output, evaluator])

# Basic-auth credentials come from the environment ("login" / "pwd").
login = os.environ.get("login")
pwd = os.environ.get("pwd")

demo.launch(max_threads=40, max_file_size="100mb", auth=(login, pwd))
evaluator.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from openai import OpenAI

from config import openai_api

client = OpenAI(api_key=openai_api)


def eval_answer(ANSWER_REFERENCE, ANSWER_TO_SCORE):
    """Score ANSWER_TO_SCORE against ANSWER_REFERENCE with GPT-4o.

    Returns the model's raw JSON string of the form
    {"score": X, "rationale_based_on_scoring_rules": "..."}.
    """
    system_prompt = f"""Your task is to evaluate how well a given answer fits with the following expected output (all sources and references should back up the given answers):
====================
EXPECTED OUTPUT
{ANSWER_REFERENCE}
=====================
You only output a float score between 1 and 5 with the following scale (sources, where information was used to answer, is a key critical expected element):

1 : out of topic, answer doesn't make sense
2 : misleading or false answer.
3: the answer makes sense but some parts of what is expected are missing or sources are missing
4: very good answer backed up by all valid sources, all key elements are present. Could be more clear
5: Perfect answer, nothing else was expected

You output the score in the following json format:
{{"score" : X, "rationale_based_on_scoring_rules" : "XXX"}}
"""
    user_prompt = f"""
Given answer:
{ANSWER_TO_SCORE}
"""
    messages = [
        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
        {"role": "user", "content": [{"type": "text", "text": user_prompt}]},
    ]
    # Deterministic scoring: temperature 0 and a forced JSON object response.
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": "json_object"},
        stream=False,
    )
    return completion.choices[0].message.content
hogwarts.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
from openai import OpenAI
from groq import Groq
import pandas as pd

from config import openai_api, groq_api, models

# LLM backend selection; only the "openai" branch is exercised by this app.
provider = "openai"
client = OpenAI(api_key=openai_api) if provider == "openai" else Groq(api_key=groq_api)

system_prompt_default = """
"""
17
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors *a* and *b*."""
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product
20
def _get_embedding(text, model="text-embedding-3-large"):
    """Embed *text* with the OpenAI embeddings API and return the vector.

    Newlines are flattened to spaces before embedding. The previous bare
    `except:` (meant to tolerate non-string inputs) silently swallowed every
    possible error; an explicit isinstance guard keeps the same tolerance
    without hiding real failures.
    """
    if isinstance(text, str):
        text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding
27
def get_answer(df, nb_in_context=10, task="Your task is to estimate the revenue evolution of the considered company."):
    """Answer *task* with GPT-4o using the *nb_in_context* chunks of *df* most
    similar to the task embedding.

    df must carry 'content' and 'embeddings' columns; embeddings may be stored
    as vectors or as their string representation (after a CSV round-trip).
    NOTE: adds/overwrites a 'similarity' column on df in place.
    """
    import ast

    embedding_query = _get_embedding(task, model="text-embedding-3-large")
    try:
        df['similarity'] = df.embeddings.apply(lambda x: cosine_similarity(x, embedding_query))
    except (TypeError, ValueError):
        # Embeddings came back from CSV as strings: parse them with
        # ast.literal_eval instead of eval(), which would execute arbitrary
        # code embedded in the data. (Bare except narrowed to the errors the
        # numeric path can actually raise on string inputs.)
        df['similarity'] = df.embeddings.apply(
            lambda x: cosine_similarity(ast.literal_eval(x), embedding_query))
    res = df.sort_values('similarity', ascending=False).head(nb_in_context).content.values

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": f"""You will be given a vast amount of data, each with it's source. {task}
Your answer, should be crisp, sharp and pinpoint to an exact source from the context (if the references of the sources are not easy to read by humans feel free to adjust so that it's readable - no links though, you refer to them as what they are).
You write using Bain's style as it will be read by private equity professionals and if asked, you never refer to yourself.
"""
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Context:\n{str(res)}"
                    }
                ]
            }
        ],
        temperature=1,
        max_tokens=1665,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    ).choices[0].message.content
    return response
71
def generate_content(input, data_dumpster, chunked_raw_content, custom_query=(False, "")):
    """Load (or build and cache) the vectorized data room for *input*, then
    answer the standard finance/product/customer questions — or, when
    custom_query = (True, query), return the single custom answer instead.

    NOTE: `input` shadows the builtin but is kept for caller compatibility.
    """
    data_locker_folder = f"./{data_dumpster}/{input.replace(' ','-')}"

    try:
        df = pd.read_csv(f"{data_locker_folder}/vectorized_data_dumpster.csv", sep=";")
    except FileNotFoundError:
        # First run for this company: embed every chunk and cache to disk.
        # (Previously a bare `except:` — a corrupt cache now raises instead of
        # being silently and expensively recomputed.)
        df = pd.DataFrame()
        df["content"] = chunked_raw_content
        df["embeddings"] = [_get_embedding(chunk) for chunk in chunked_raw_content]
        df.to_csv(f"{data_locker_folder}/vectorized_data_dumpster.csv", sep=";")

    finance = "Your task is to estimate the revenue evolution of the considered company."
    product = "Your task is to give an overview of the line of products of the considered company."
    customer = "Your task is to give the probable customer segmentation of the considered company."

    in_context = 15
    if custom_query[0]:
        print("Generating custom chat output")
        return get_answer(df, in_context, custom_query[1])

    print("Generating financials content")
    finance_content = get_answer(df, in_context, finance)
    print("Generating product content")
    product_content = get_answer(df, in_context, product)
    print("Generating customer segmentation content")
    customer_content = get_answer(df, in_context, customer)
    print("Done!")

    rag_content = {
        "finance": finance_content,
        "product": product_content,
        "customer_segmentation": customer_content,
    }
    return rag_content
questions.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard due-diligence question templates fed to the RAG pipeline, in the
# order they are presented: overview, history, financials, products, revenue
# split, SKU count.
questions = [
    """What does the company do?
2 line summary of companies activities (business model, sector, ...)
Deepdive overview of products and Services company offers (high level, just 5 or 6 points with maybe 1 or 2 examples)
Customer overview, which sectors or type of clients does it works with
What are the different business units
What is their unique selling point (e.g., price, customer connection, scale, specialisation, ...)
""",
    """
Give me an overview of the history of the company?
When is it founded
Where is it founded
What M&A happened when (add size (can be revenue, EBITDA or FTEs, product/service/customer/geo focus)
Change of leadership
Change of ownership
""",
    """
How do the financials of the company look like?
Revenue, COGS, Gross margin, EBITDA, Debt over the last 5 years - Add 5 year CAGR -> table or graph format
Explanation of large changes in financials -> debt uptake, ownership change, M&A, Covid, Financial down turn, ...
Make sure it's for all legal entities across the different countries
""",
    """
Give me an overview of the products/services of the company:
Detailed overview of the different products and services, needs to be bucketed n max 5-7 elements but can be refined in the explanation of every bucket
Ideally you indicate which type of customer they serve with this product
Ideally you indicate where the most revenue comes from.
If not, just highlight the one which is mentioned most or put first in lists as most important one
Ideally you clarify with a picture of the product
""",
    """
Give me an overview of the revenue split in geo/Product/service/End-market
""",
    """
What number of SKUs does the company have?
Output is a number and ideally also indicated by subsegment""",
]