# File size: 2,836 Bytes
# 06b126d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import numpy as np
import pandas as pd
from openai import OpenAI
import config 

# Shared OpenAI client used by every API call in this module.
# The original line read `OpenAI(api_key=)`, which is a SyntaxError (the value
# was missing, presumably redacted).
# NOTE(review): the attribute name `OPENAI_API_KEY` on `config` is an
# assumption -- confirm against config.py. When the attribute is absent,
# `None` is passed and the OpenAI client falls back to the OPENAI_API_KEY
# environment variable.
client = OpenAI(api_key=getattr(config, "OPENAI_API_KEY", None))

def cosine_similarity(a, b):
    """Return the cosine similarity between vectors *a* and *b*.

    Args:
        a: Array-like vector.
        b: Array-like vector compatible with *a* for ``np.dot``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero norm (the ratio is undefined there and would otherwise come out
        as NaN together with a RuntimeWarning).

    Raises:
        ValueError: If the shapes of *a* and *b* are incompatible for
            ``np.dot``.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction; report "no similarity" rather than NaN.
        return 0.0
    return np.dot(a, b) / norm_product

def _get_embedding(text, model="text-embedding-3-large"):
    """Request an embedding for *text* from the OpenAI embeddings endpoint.

    Args:
        text: Input to embed. When it is a ``str``, newlines are replaced by
            spaces before the request; any other value is forwarded
            unchanged (the original code swallowed the failed ``.replace``
            with a bare ``except`` and did the same).
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    if isinstance(text, str):
        text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding



def augment_user_input(user_input):
    """Extend *user_input* with model-generated training-program suggestions.

    Sends a single user message to the chat completions endpoint asking for a
    bullet list of training programs in French suited to the profile, then
    returns the original input followed by a newline and the model's reply.

    Args:
        user_input: Student profile text; it is interpolated into the prompt
            and also forms the first part of the return value.

    Returns:
        ``"<user_input>\\n<model reply>"`` as a single string.
    """
    # The prompt is spelled out piece by piece so the exact whitespace that is
    # sent to the model (leading newline, 4-space indents, trailing spaces) is
    # visible and cannot be lost to editor whitespace trimming.
    request_text = (
        "\n"
        "    Based on the profile of this student, propose a highly detailed bullet point list of training programs in French that could be good for him:\n"
        "    \n"
        f"    {user_input} \n"
        "    "
    )
    completion = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        temperature=1,
        max_tokens=400,
        messages=[{"role": "user", "content": request_text}],
    )
    suggestions = completion.choices[0].message.content
    return f"{user_input}\n{suggestions}"

def _parse_embedding(raw):
    """Convert one cell of the ``Embeddings`` column into a vector.

    String cells are evaluated as Python expressions (presumably a list
    literal of floats -- confirm against the data file); any other value is
    returned unchanged.

    Raises:
        ValueError: If a string cell cannot be evaluated.
    """
    if not isinstance(raw, str):
        return raw
    try:
        # NOTE(security): eval() executes arbitrary code found in the CSV.
        # It is kept to preserve existing behavior; if the cells are plain
        # list literals, switch to ast.literal_eval or json.loads once the
        # data format has been confirmed.
        return eval(raw)
    except Exception as err:
        raise ValueError(
            f"Unparseable value in 'Embeddings' column: {raw[:60]!r}"
        ) from err


def _format_program(summary):
    """Render one program summary as a display section.

    The text is split on ``"##"``. From the first section after the leading
    chunk only the part before the first blank line is kept; every later
    section is appended verbatim (without re-inserting ``"##"``). Then
    ``"\\n# "`` becomes ``"\\n### "`` and the placeholder ``"55555"`` becomes
    ``"###"``. The result is prefixed with ``"##"`` and followed by a
    ``------`` separator.

    A summary containing no ``"##"`` at all is rendered whole instead of
    raising ``IndexError``.
    """
    sections = str(summary).split("##")
    if len(sections) < 2:
        body = sections[0]
    else:
        body = sections[1].split("\n\n")[0] + "".join(sections[2:])
    body = body.replace("\n# ", "\n### ").replace("55555", "###")
    return "##" + body + "\n\n------\n\n"


def search_programs(
    raw_input,
    nb_programs_to_display=10,
    augment_input=False,
    filters=None,
    path_to_csv="data_planeta_february2024.csv",
):
    """Return the programs most similar to a user query, as one display string.

    Rows of the CSV are ranked by cosine similarity between their stored
    embedding and the embedding of the (optionally augmented) query.

    Args:
        raw_input: The user's query / profile text.
        nb_programs_to_display: Maximum number of programs to return; passed
            through ``int()``.
        augment_input: When true, the query is first expanded with
            ``augment_user_input``.
        filters: Optional collection of school names. When non-empty, only
            rows whose ``"ÉCOLE"`` column equals ``"\\nÉCOLE: <name>"`` for
            one of the names are kept. ``None`` (the default) or an empty
            collection disables filtering.
        path_to_csv: CSV file to search. The code reads the columns
            ``"Embeddings"``, ``"summary_french"`` and, when filtering,
            ``"ÉCOLE"``.

    Returns:
        The formatted sections of the top-ranked programs concatenated in
        rank order, or ``""`` when no row is left to rank.

    Raises:
        ValueError: If an ``Embeddings`` cell cannot be parsed.
    """
    df = pd.read_csv(path_to_csv).dropna(subset=["Embeddings"])

    if filters is not None and len(filters) != 0:
        school_labels = [f"\nÉCOLE: {school}" for school in filters]
        df = df[df["ÉCOLE"].isin(school_labels)].reset_index(drop=True).copy()

    if df.empty:
        # Nothing to rank: skip the (billable) API calls entirely.
        return ""

    user_input = augment_user_input(raw_input) if augment_input else raw_input
    query_embedding = _get_embedding(user_input, model="text-embedding-3-large")

    def _score(raw):
        # Parsing errors propagate (corrupt data file); a vector that merely
        # cannot be compared with the query scores 0, as before.
        vector = _parse_embedding(raw)
        try:
            return cosine_similarity(vector, query_embedding)
        except (TypeError, ValueError):
            return 0

    df["similarity"] = df["Embeddings"].apply(_score)
    top = df.sort_values("similarity", ascending=False).head(
        int(nb_programs_to_display)
    )
    return "".join(_format_program(summary) for summary in top["summary_french"])