Adr740 committed on
Commit
f798440
·
verified ·
1 Parent(s): 8c257fb

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +219 -0
  2. evaluator.py +57 -0
  3. hogwarts.py +107 -0
  4. questions.py +40 -0
app.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from functools import partial
4
+ from config import file_id_htl_biotech, file_id_kamera_express, file_id_smart_sd, file_id_sunday_naturals
5
+ import gdown
6
+ import pandas as pd
7
+ from hogwarts import get_answer
8
+ from evaluator import eval_answer
9
+
10
+ dico = {file_id_htl_biotech : {"name" : "htl-biotechnology", "data" : None},
11
+ file_id_smart_sd : {"name" : "smart-sd", "data" : None},
12
+ file_id_kamera_express : {"name" : "kamera-express", "data" : None},
13
+ file_id_sunday_naturals : {"name" : "sunday-naturals", "data" : None}, }
14
+
15
+ choices = ["htl-biotechnology",
16
+ "smart-sd",
17
+ "kamera-express",
18
+ "sunday-naturals"]
19
+
20
+ title = "AI4PE - Olivier and Adam \n contact: adamrida.ra@gmail.com or sp.olivier@hotmail.com"
21
+
22
+
23
+ for file_id in dico:
24
+ print("GOING FOR ", dico[file_id]["name"])
25
+ download_url = f'https://drive.google.com/uc?id={file_id}'
26
+
27
+ # Download the file using gdown
28
+ output = 'downloaded_file.csv'
29
+ gdown.download(download_url, output, quiet=False)
30
+
31
+ # Read the CSV file into a DataFrame
32
+ df = pd.read_csv(output, sep=";")[["content", "embeddings"]].replace("transcript_", "expert_meeting_notes_")
33
+ dico[file_id]["data"] = df
34
+
35
+ id_to_name_mapper = {
36
+ file_id_htl_biotech : 'htl-biotechnology',
37
+ file_id_smart_sd : 'smart-sd',
38
+ file_id_kamera_express : 'kamera-express',
39
+ file_id_sunday_naturals : 'sunday-naturals',
40
+ }
41
+ name_to_id_mapper = {
42
+ 'htl-biotechnology': file_id_htl_biotech,
43
+ 'smart-sd': file_id_smart_sd,
44
+ 'kamera-express': file_id_kamera_express,
45
+ 'sunday-naturals': file_id_sunday_naturals,
46
+ }
47
+
48
def get_list_files(company, dico=dico, name_to_id_mapper=name_to_id_mapper):
    """Summarize the sources present in *company*'s data room.

    Returns three markdown strings: (web pages, uploaded PDFs, expert-call notes).
    """
    pdf_sources = []
    web_sources = []
    note_sources = []
    for record in dico[name_to_id_mapper[company]]["data"].content.values:
        header = record.split("\n")[0]

        if "SOURCE: COMPANY WEBSITE" in record:
            # Turn the stored URL-ish filename into something readable.
            cleaned = (header
                       .replace("https::", "")
                       .replace("https:", "")
                       .replace(".txt", "")
                       .replace(".com", " ")
                       .replace(".", " Page: "))
            web_sources.append(cleaned)
        if "SOURCE: PDF FILE" in record:
            # Recover the original pdf name from the stored PATH_FILE metadata.
            pdf_name = record.split("PATH_FILE =")[1].split("'}\"")[0].split("/pdfs/")[1].split("/png")[0]
            pdf_sources.append("SOURCE: UPLOADED PDF - " + pdf_name + ".pdf")
        if "SOURCE: NOTES FROM EXPERT CALL" in record:
            # Rename raw transcript filenames to human-friendly "Note #N" labels.
            note_name = (record
                         .replace("_1 copy", "")
                         .replace("transcript ", "Note #")
                         .replace("transcript_1", "Note #2")
                         .replace("transcript", "Note #1")
                         .replace(".txt", "")
                         .split("\n")[0])
            note_sources.append(note_name)

    # set() drops duplicates before rendering each section.
    pdfs_string = "## Uploaded PDF files: \n" + "\n\n".join(set(pdf_sources))
    web_pages = "## Enriched from the web: \n" + "\n\n".join(set(web_sources))
    transcript = "## Uploaded notes from expert calls: \n" + "\n\n".join(set(note_sources))
    return web_pages, pdfs_string, transcript
77
def get_data_room_overview(company, dico=dico, name_to_id_mapper=name_to_id_mapper):
    """Build the data-room overview markdown plus the per-company source lists.

    Returns 13 values in the exact order the Gradio outputs list expects:
    overview, then (web, pdfs, expert) for sunday-naturals, smart-sd,
    htl-biotechnology and kamera-express.
    """
    contents = dico[name_to_id_mapper[company]]["data"].content.values
    nb_web = sum("SOURCE: COMPANY WEBSITE" in c for c in contents)
    nb_pdfs = sum("SOURCE: PDF FILE" in c for c in contents)
    nb_expert_transcripts = sum("SOURCE: NOTES FROM EXPERT CALL" in c for c in contents)
    disp = f"""---
### Overview of the data room
Enriched data room with: Linkedin profile and company website

Volumetry:
- {nb_pdfs} passages from PDF files
- {nb_web} passages from company website
- {nb_expert_transcripts} passages from notes of expert calls
"""

    # Refresh every company's source lists (all tabs are updated at once).
    per_company = {
        name: get_list_files(name, dico, name_to_id_mapper)
        for name in ("sunday-naturals", "smart-sd", "htl-biotechnology", "kamera-express")
    }

    return (disp,
            *per_company["sunday-naturals"],
            *per_company["smart-sd"],
            *per_company["htl-biotechnology"],
            *per_company["kamera-express"])
106
def generate_chat_answer(company_name, query):
    """Run retrieval-augmented answering for *company_name* and score the result.

    Returns (response_markdown, evaluation_markdown) for the Gradio UI.
    """
    import json

    df = dico[name_to_id_mapper[company_name]]["data"]
    response = get_answer(df, 15, query)
    print("=====> Evaluating answer quality...")
    # eval_answer returns a JSON string (the evaluator requests
    # response_format={"type": "json_object"}), so parse it with json.loads.
    # The previous eval() executed arbitrary model output as Python code.
    eval_score = json.loads(eval_answer(query, response))
    # Typos fixed in the user-facing text ("Evalation"/"intial").
    eval_md = f"""
### Evaluation of how well the response answers the initial question

Score of **{eval_score["score"]}/5**

Rationale:

{eval_score["rationale_based_on_scoring_rules"]}
"""

    return response, eval_md
124
def _source_tab(tab_label):
    """Build one "Data" sub-tab with three markdown panes (web / pdfs / notes)."""
    with gr.Tab(tab_label):
        with gr.Row():
            with gr.Column():
                web = gr.Markdown("Sources obtained from website")
            with gr.Column():
                pdfs = gr.Markdown("Sources obtained from uploaded pdfs")
                expert = gr.Markdown("Sources obtained from expert call notes")
    return web, pdfs, expert


with gr.Blocks(title=title, theme='nota-ai/theme') as demo:
    gr.Markdown(f"## {title}")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            company_name = gr.Dropdown(choices=choices, label="Select company")
            submit_button = gr.Button(value="Load workspace")
            data_room_overview = gr.Markdown("---\n### Overview of the data room")
        with gr.Column(scale=6):
            with gr.Tab("Chat - Baseline"):
                with gr.Row():
                    with gr.Column(scale=5):
                        chat_input = gr.Textbox(placeholder="Chat input", lines=2, label="Retrieve anything from the dataroom")
                    with gr.Column(scale=1):
                        chat_submit_button = gr.Button(value="Submit")
                with gr.Accordion("Accuracy score", open=False):
                    evaluator = gr.Markdown("Waiting for answer to evaluate...")
                chat_output = gr.Markdown("Waiting for question...")
            with gr.Tab("Chat - ICL", interactive=False):
                with gr.Column(scale=5):
                    chat_input_gemini = gr.Textbox(placeholder="Chat input", lines=2, label="Retrieve anything from the dataroom")
                with gr.Column(scale=1):
                    chat_submit_button_gemini = gr.Button(value="Submit")
                chat_output_gemini = gr.Markdown("Waiting for question...")
            with gr.Tab("Data", interactive=True):
                # One identical sub-tab per company, built by the shared helper.
                sunday_naturals_web, sunday_naturals_pdfs, sunday_naturals_expert = _source_tab("Sunday Naturals")
                smart_sd_web, smart_sd_pdfs, smart_sd_expert = _source_tab("Smart SD")
                htl_biotech_web, htl_biotech_pdfs, htl_biotech_expert = _source_tab("HTL Biotech")
                kamera_express_web, kamera_express_pdfs, kamera_express_expert = _source_tab("Kamera Express")
            with gr.Tab("Benchmark", interactive=False):
                pass

    fn = partial(get_data_room_overview)
    fn_chat = partial(generate_chat_answer)
    # "Load workspace": one overview pane plus three source panes per company,
    # in the exact order get_data_room_overview returns them.
    submit_button.click(fn=fn, inputs=[company_name], outputs=[
        data_room_overview,
        sunday_naturals_web, sunday_naturals_pdfs, sunday_naturals_expert,
        smart_sd_web, smart_sd_pdfs, smart_sd_expert,
        htl_biotech_web, htl_biotech_pdfs, htl_biotech_expert,
        kamera_express_web, kamera_express_pdfs, kamera_express_expert,
    ])

    chat_submit_button.click(fn=fn_chat, inputs=[company_name, chat_input], outputs=[chat_output, evaluator])

# Basic-auth credentials come from the environment ("login" / "pwd").
login = os.environ.get("login")
pwd = os.environ.get("pwd")

demo.launch(max_threads=40, max_file_size="100mb", auth=(login, pwd))
evaluator.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from openai import OpenAI

from config import openai_api

client = OpenAI(api_key=openai_api)


def eval_answer(ANSWER_REFERENCE, ANSWER_TO_SCORE):
    """Score ANSWER_TO_SCORE against ANSWER_REFERENCE with GPT-4o.

    Returns the model's raw JSON string of the form
    {"score": X, "rationale_based_on_scoring_rules": "..."}.
    """
    system_prompt = f"""Your task is to evaluate how well a given answer fits with the following expected output (all sources and references should back up the given answers):
====================
EXPECTED OUTPUT
{ANSWER_REFERENCE}
=====================
You only output a float score between 1 and 5 with the following scale (sources, where information was used to answer, is a key critical expected element):

1 : out of topic, answer doesn't make sense
2 : misleading or false answer.
3: the answer makes sense but some parts of what is expected are missing or sources are missing
4: very good answer backed up by all valid sources, all key elements are present. Could be more clear
5: Perfect answer, nothing else was expected

You output the score in the following json format:
{{"score" : X, "rationale_based_on_scoring_rules" : "XXX"}}
"""
    user_prompt = f"""
Given answer:
{ANSWER_TO_SCORE}
"""
    messages = [
        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
        {"role": "user", "content": [{"type": "text", "text": user_prompt}]},
    ]
    # Deterministic scoring: temperature 0 and a forced JSON object response.
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": "json_object"},
        stream=False,
    )
    return completion.choices[0].message.content
hogwarts.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
from openai import OpenAI
from groq import Groq
import pandas as pd

from config import openai_api, groq_api, models

# LLM backend selection; only the "openai" branch is exercised by this app.
provider = "openai"
client = OpenAI(api_key=openai_api) if provider == "openai" else Groq(api_key=groq_api)

system_prompt_default = """
"""
17
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors *a* and *b*."""
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product
20
def _get_embedding(text, model="text-embedding-3-large"):
    """Embed *text* with the OpenAI embeddings API and return the vector.

    Newlines are flattened to spaces before embedding. The previous bare
    `except:` (meant to tolerate non-string inputs) silently swallowed every
    possible error; an explicit isinstance guard keeps the same tolerance
    without hiding real failures.
    """
    if isinstance(text, str):
        text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding
27
def get_answer(df, nb_in_context=10, task="Your task is to estimate the revenue evolution of the considered company."):
    """Answer *task* with GPT-4o using the *nb_in_context* chunks of *df* most
    similar to the task embedding.

    df must carry 'content' and 'embeddings' columns; embeddings may be stored
    as vectors or as their string representation (after a CSV round-trip).
    NOTE: adds/overwrites a 'similarity' column on df in place.
    """
    import ast

    embedding_query = _get_embedding(task, model="text-embedding-3-large")
    try:
        df['similarity'] = df.embeddings.apply(lambda x: cosine_similarity(x, embedding_query))
    except (TypeError, ValueError):
        # Embeddings came back from CSV as strings: parse them with
        # ast.literal_eval instead of eval(), which would execute arbitrary
        # code embedded in the data. (Bare except narrowed to the errors the
        # numeric path can actually raise on string inputs.)
        df['similarity'] = df.embeddings.apply(
            lambda x: cosine_similarity(ast.literal_eval(x), embedding_query))
    res = df.sort_values('similarity', ascending=False).head(nb_in_context).content.values

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": f"""You will be given a vast amount of data, each with it's source. {task}
Your answer, should be crisp, sharp and pinpoint to an exact source from the context (if the references of the sources are not easy to read by humans feel free to adjust so that it's readable - no links though, you refer to them as what they are).
You write using Bain's style as it will be read by private equity professionals and if asked, you never refer to yourself.
"""
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Context:\n{str(res)}"
                    }
                ]
            }
        ],
        temperature=1,
        max_tokens=1665,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    ).choices[0].message.content
    return response
71
def generate_content(input, data_dumpster, chunked_raw_content, custom_query=(False, "")):
    """Load (or build and cache) the vectorized data room for *input*, then
    answer the standard finance/product/customer questions — or, when
    custom_query = (True, query), return the single custom answer instead.

    NOTE: `input` shadows the builtin but is kept for caller compatibility.
    """
    data_locker_folder = f"./{data_dumpster}/{input.replace(' ','-')}"

    try:
        df = pd.read_csv(f"{data_locker_folder}/vectorized_data_dumpster.csv", sep=";")
    except FileNotFoundError:
        # First run for this company: embed every chunk and cache to disk.
        # (Previously a bare `except:` — a corrupt cache now raises instead of
        # being silently and expensively recomputed.)
        df = pd.DataFrame()
        df["content"] = chunked_raw_content
        df["embeddings"] = [_get_embedding(chunk) for chunk in chunked_raw_content]
        df.to_csv(f"{data_locker_folder}/vectorized_data_dumpster.csv", sep=";")

    finance = "Your task is to estimate the revenue evolution of the considered company."
    product = "Your task is to give an overview of the line of products of the considered company."
    customer = "Your task is to give the probable customer segmentation of the considered company."

    in_context = 15
    if custom_query[0]:
        print("Generating custom chat output")
        return get_answer(df, in_context, custom_query[1])

    print("Generating financials content")
    finance_content = get_answer(df, in_context, finance)
    print("Generating product content")
    product_content = get_answer(df, in_context, product)
    print("Generating customer segmentation content")
    customer_content = get_answer(df, in_context, customer)
    print("Done!")

    rag_content = {
        "finance": finance_content,
        "product": product_content,
        "customer_segmentation": customer_content,
    }
    return rag_content
questions.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard due-diligence question templates fed to the RAG pipeline, in the
# order they are presented: overview, history, financials, products, revenue
# split, SKU count.
questions = [
    """What does the company do?
2 line summary of companies activities (business model, sector, ...)
Deepdive overview of products and Services company offers (high level, just 5 or 6 points with maybe 1 or 2 examples)
Customer overview, which sectors or type of clients does it works with
What are the different business units
What is their unique selling point (e.g., price, customer connection, scale, specialisation, ...)
""",
    """
Give me an overview of the history of the company?
When is it founded
Where is it founded
What M&A happened when (add size (can be revenue, EBITDA or FTEs, product/service/customer/geo focus)
Change of leadership
Change of ownership
""",
    """
How do the financials of the company look like?
Revenue, COGS, Gross margin, EBITDA, Debt over the last 5 years - Add 5 year CAGR -> table or graph format
Explanation of large changes in financials -> debt uptake, ownership change, M&A, Covid, Financial down turn, ...
Make sure it's for all legal entities across the different countries
""",
    """
Give me an overview of the products/services of the company:
Detailed overview of the different products and services, needs to be bucketed n max 5-7 elements but can be refined in the explanation of every bucket
Ideally you indicate which type of customer they serve with this product
Ideally you indicate where the most revenue comes from.
If not, just highlight the one which is mentioned most or put first in lists as most important one
Ideally you clarify with a picture of the product
""",
    """
Give me an overview of the revenue split in geo/Product/service/End-market
""",
    """
What number of SKUs does the company have?
Output is a number and ideally also indicated by subsegment""",
]