File size: 2,853 Bytes
06b126d 40cb843 06b126d 9f89466 06b126d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import numpy as np
import pandas as pd
from openai import OpenAI
import os
# Module-level OpenAI client used by the helpers in this file. The API key is
# read from the "apk" environment variable; os.environ.get returns None when
# that variable is not set, and that None is passed straight to OpenAI().
client = OpenAI(api_key= os.environ.get("apk"))
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: Array-like numeric vector.
        b: Array-like numeric vector, dot-product compatible with ``a``.

    Returns:
        ``np.dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the direction is undefined, so ``0.0`` is returned instead of
        dividing zero by zero (which would yield NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Degenerate (all-zero or empty) vector: treat as "no similarity".
        return 0.0
    return np.dot(a, b) / norm_product
def _get_embedding(text, model="text-embedding-3-large"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The input to embed. When it is a string, newlines are replaced
            by spaces before the request; any other value is sent unchanged.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list (presumably a list of floats; confirm against the
        OpenAI client documentation).
    """
    try:
        text = text.replace("\n", " ")
    except (AttributeError, TypeError):
        # Non-string input (no usable ``replace``): forward it as-is, which is
        # what the original best-effort code did, but without a bare
        # ``except`` that would also swallow KeyboardInterrupt/SystemExit.
        pass
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding
def augment_user_input(user_input):
    """Append model-generated program suggestions to a student profile.

    Sends a single user message to the ``gpt-4-turbo-preview`` chat model
    asking for a bullet list of training programs suited to the profile, then
    returns the original profile followed by a newline and the model's reply.

    Args:
        user_input: Text describing the student profile.

    Returns:
        ``user_input``, a newline, and the content of the first completion
        choice, concatenated into one string.
    """
    # Built piece by piece so the prompt text does not depend on how this
    # function body happens to be indented.
    prompt = (
        "\n"
        "Based on the profile of this student, propose a highly detailed bullet point list of training programs in French that could be good for him:\n"
        f"{user_input}\n"
    )
    completion = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        temperature=1,
        max_tokens=400,
        messages=[{"role": "user", "content": prompt}],
    )
    suggestions = completion.choices[0].message.content
    return f"{user_input}\n{suggestions}"
def search_programs(
    raw_input,
    nb_programs_to_display=10,
    augment_input=False,
    filters=None,
    path_to_csv="data_planeta_february2024.csv",
):
    """Return a Markdown digest of the programs most similar to a query.

    The query (optionally enriched through ``augment_user_input``) is embedded
    with ``_get_embedding`` and compared, via ``cosine_similarity``, with the
    embedding stored in each row of the CSV. The best-scoring rows are
    rendered from their ``summary_french`` column.

    Args:
        raw_input: Free-text query / student profile.
        nb_programs_to_display: Maximum number of programs to render; it is
            converted with ``int()``.
        augment_input: When true, the query is passed through
            ``augment_user_input`` before being embedded.
        filters: Optional collection of school names. When non-empty, only
            rows whose ``ÉCOLE`` value equals ``"\\nÉCOLE: <name>"`` for one
            of the names are kept. ``None`` or an empty collection disables
            filtering.
        path_to_csv: CSV file to load. It must contain ``Embeddings`` and
            ``summary_french`` columns, plus ``ÉCOLE`` when filters are used.

    Returns:
        One string holding, for each selected program, ``"##"`` + the
        reformatted summary + ``"\\n\\n------\\n\\n"``. Empty string when no
        row qualifies.

    Raises:
        KeyError: If a required column is missing from the CSV.
        Errors raised by ``pd.read_csv`` or by the OpenAI calls propagate
        unchanged.
    """
    user_input = augment_user_input(raw_input) if augment_input else raw_input

    df = pd.read_csv(path_to_csv).dropna(subset=["Embeddings"])
    if filters is not None and len(filters) != 0:
        # Presumably the "ÉCOLE" cells carry this exact "\nÉCOLE: " prefix in
        # the data file; the literal is kept unchanged from the original code.
        wanted_schools = [f"\nÉCOLE: {school}" for school in filters]
        df = df[df["ÉCOLE"].isin(wanted_schools)].reset_index(drop=True)

    query_embedding = _get_embedding(user_input, model="text-embedding-3-large")

    def _similarity_to_query(raw_embedding):
        """Score one stored embedding; malformed rows score 0."""
        try:
            # NOTE(security): eval() executes whatever the CSV cell contains.
            # It is kept because the stored format cannot be confirmed from
            # this file; if the cells are plain list literals, switch to
            # ast.literal_eval or json.loads.
            return cosine_similarity(eval(raw_embedding), query_embedding)
        except Exception:
            # Best-effort ranking: a single unparsable or mismatched row must
            # not abort the search (the original dropped into breakpoint()).
            return 0

    scores = df["Embeddings"].apply(_similarity_to_query)
    top_records = (
        df.assign(similarity=scores)
        .sort_values("similarity", ascending=False)
        .head(int(nb_programs_to_display))
        .to_dict(orient="records")
    )

    sections = []
    for record in top_records:
        parts = str(record["summary_french"]).split("##")
        if len(parts) < 2:
            # No "##" marker (e.g. a missing summary): nothing to render, and
            # indexing parts[1] would raise IndexError.
            continue
        # Keep only the first paragraph of the first "##" section, then glue
        # on every later piece. Splitting on "##" turns "\n### x" into
        # "\n" + "# x", so the replace below restores those "###" headings.
        body = parts[1].split("\n\n")[0] + "".join(parts[2:])
        body = body.replace("\n# ", "\n### ").replace("55555", "###")
        sections.append("##" + body + "\n\n------\n\n")
    return "".join(sections)
|