Course_rec / app /app.py
Tao Wu
add skills query
6a029cd
raw
history blame
10.6 kB
import gradio as gr
import pandas as pd
import redis
import json
import requests
from config import *
import functools
from embedding_setup import retriever, find_similar_occupation, compare_docs_with_context,generate_exp,generate_prompt_exp
from data_process import get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
with open('/app/data/redis_data.json', 'r') as file:
data_dict = json.load(file)
#r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True)
skill_details_mapping = {}
# Function to retrieve documents based on selected skills
def retrieve_documents(occupation,skills):
output = []
output.append(f"<div style=\"text-align: center; font-size: 24px;\">Empfehlungsergebnisse:</div>")
oc_uri = occupations.get(occupation, "")
skill_query = ''
candidate_docs = []
if isinstance(oc_uri, int):
df = pd.read_csv("/app/data/berufe_info.csv")
target_occupation = df[df['id'] == oc_uri]
target_occupation_name = target_occupation['short name'].values[0]
target_occupation_dsp = target_occupation['description'].values[0]
target_occupation_query = target_occupation_name + ' ' + target_occupation_dsp
target_occupation_query = target_occupation_query
else:
target_occupation = get_occupation_detial(oc_uri)
target_occupation_name, target_occupation_dsp, target_occupation_query = build_occupation_query(target_occupation)
for german_label in skills:
skill_query += german_label + ' '
ocsk_query = target_occupation_name + ' ' + german_label
skills_docs = retriever.get_relevant_documents(german_label)
candidate_docs.extend(skills_docs[:2])
query = 'target occupation: ' + target_occupation_query + ' Skills gap:' + skill_query
llama_query = 'info:' + target_occupation_name + ' ' + 'Skills gap:' + skill_query
print(query)
docs = retriever.get_relevant_documents(query)
candidate_docs.extend(docs[:5])
#remove duplicates
seen_course_ids = set()
candidate_doc_unique = []
for doc in candidate_docs:
course_id = doc.metadata.get('id','')
if course_id not in seen_course_ids:
candidate_doc_unique.append(doc)
seen_course_ids.add(course_id)
partial_compare_docs = functools.partial(compare_docs_with_context, target_occupation_name=target_occupation_name, target_occupation_dsp=target_occupation_dsp,skill_gap = skill_query)
sorted_docs = sorted(candidate_doc_unique, key=functools.cmp_to_key(partial_compare_docs), reverse=True)
batch_prompts = []
for doc in sorted_docs[:5]:
doc_name = doc.metadata.get('name', 'Unnamed Document')
doc_skill = doc.metadata.get('skills', '')
input_text = f"target occupation: {llama_query}\n Recommended course: name: {doc_name}, learning objectives: {doc_skill[:2000]}"
prompt = generate_prompt_exp(input_text)
batch_prompts.append(prompt)
# Evaluate the current batch of prompts
batch_output = generate_exp(batch_prompts)
output.append(f"<b>Zielberuf:</b> {target_occupation_name}")
output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
output.append(f"<b>Empfohlene Kurse:</b>")
for i in range(5):
doc = sorted_docs[i]
doc_name = doc.metadata.get('name', 'Unnamed Document')
doc_url = doc.metadata.get('url', '#')
doc_skill = doc.metadata.get('skills', '')
output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
output.append(f"<b>Empfehlungsgrund:</b> {batch_output[i]}")
output.append(f"<br>")
return "<br>".join(output)
def get_candidate_courses(occupation, skills):
output = []
output.append(f"<div style=\"text-align: center; font-size: 24px;\">Empfehlungsergebnisse:</div>")
df_lookup = pd.read_csv('/app/data/kldb_isco_lookup.csv')
df_berufe = pd.read_csv('/app/data/berufe_info.csv')
occupation_codes = set()
kldB_set = set()
occupation_hrefs = set()
BA_berufe = set()
oc_uri = occupations.get(occupation, "")
target_occupation = get_occupation_detial(oc_uri)
target_occupation_query = build_occupation_query(target_occupation)
for german_label in skills:
skill = skill_details_mapping.get(german_label, {})
uri = f'https://ec.europa.eu/esco/api/resource/skill?selectedVersion=v1.0.9&language=en&uri={skill["uri"]}'
try:
skill_response = requests.get(uri)
skill_response.raise_for_status()
skill_json = skill_response.json()
# Combine essential and optional occupations
skill_related_occupations = (skill_json['_links'].get('isEssentialForOccupation', []) +
skill_json['_links'].get('isOptionalForOccupation', []))
for occupation in skill_related_occupations:
href = occupation.get('href')
if href:
occupation_hrefs.add(href)
except requests.RequestException as e:
print(f"Error while fetching skill details: {e}")
for href in occupation_hrefs:
try:
occupation_response = requests.get(href)
occupation_response.raise_for_status()
occupation_details = occupation_response.json()
code = occupation_details.get('code')
if code:
occupation_codes.add(code.split('.')[0])
except requests.RequestException as e:
print(f"Error while fetching occupation details: {e}")
for isco_code in occupation_codes:
kldB_codes = df_lookup[df_lookup['isco08'] == int(isco_code)]['kldb2010'].values
for code in kldB_codes:
kldB_set.add(str(code))
dfs = []
for kldb in kldB_set:
berufe = df_berufe[df_berufe['KldB codes']=='B '+kldb]
dfs.append(berufe)
merged_df = pd.concat(dfs, ignore_index=True)
top_k_berufe = find_similar_occupation(target_occupation_query,merged_df,5,'cosine')
for beruf in top_k_berufe:
entry_requirement = beruf.metadata['entry_requirements']
corrected_json_string = entry_requirement.replace("'", '"')
entry_requirement_json = json.loads(corrected_json_string)
for js in entry_requirement_json:
BA_berufe.add(str(js['data_idref']))
result = get_courses_from_BA(BA_berufe)
courses = result
for course in courses['_embedded']['termine']:
output.append(f"<a href='{course['angebot']['link']}' target='_blank'>{course['angebot']['titel']}</a>")
return "<br>".join(output)
def get_occupation_skills(oc_uri):
#skills_json = r.get(oc_uri)
skills_json = data_dict.get(oc_uri, None)
skill_labels = []
if skills_json:
skills = json.loads(skills_json)
for skill in skills:
german_label = skill['preferredLabel']['de']
skill_details_mapping[german_label] = skill
skill_labels.append(german_label)
return skill_labels
else:
return skill_labels
def get_occupation_skills_BA(oc_uri):
df = pd.read_csv("/app/data/berufe_info.csv")
essential_skills = df[df['id'] == oc_uri]['essential skills'].values
optional_skills = df[df['id'] == oc_uri]['optional skills'].values
combined_skills = essential_skills[0][:-1] + ',' + optional_skills[0][1:]
combined_skills = combined_skills.replace("'", "\"")
skills = json.loads(combined_skills)
skill_labels = []
for skill in skills:
german_label = skill['skill']
skill_details_mapping[german_label] = skill
skill_labels.append(german_label)
return skill_labels
# Function to update the skills dropdown
def update_skills(occupation):
oc_uri = occupations.get(occupation, "")
if isinstance(oc_uri, int):
skills = get_occupation_skills_BA(oc_uri)
return gr.Dropdown(skills,label="aktuelle Fähigkeiten", multiselect=True,info='Bitte wählen Sie die Fähigkeiten aus, die Sie derzeit besitzen')
else:
skills = get_occupation_skills(oc_uri)
return gr.Dropdown(skills,label="aktuelle Fähigkeiten", multiselect=True,info='Bitte wählen Sie die Fähigkeiten aus, die Sie derzeit besitzen')
return
def update_skillgap(occupation, current_skills):
oc_uri = occupations.get(occupation, "")
if isinstance(oc_uri, int):
ocupation_skills = get_occupation_skills_BA(oc_uri)
else:
ocupation_skills = get_occupation_skills(oc_uri)
skill_gap = [skill for skill in ocupation_skills if skill not in current_skills]
return gr.Dropdown(skill_gap, label="Qualifikationslücke", multiselect=True, info='Bitte wählen Sie die Fähigkeiten aus, die Sie lernen möchten.')
if __name__ == "__main__":
# Load occupations from CSV
occupations_esco = get_occupations_from_csv(CSV_FILE_PATH)
df = pd.read_csv("/app/data/berufe_info.csv")
occupations_BA = df[['short name', 'id']].set_index('short name').to_dict()['id']
occupations = {**occupations_esco, **occupations_BA}
# Gradio interface
with gr.Blocks(title="MyEduLife Kursempfehlungssystem") as demo:
occupation_dropdown = gr.Dropdown(list(occupations.keys()), label="Zielberuf",info='Bitte wählen Sie Ihren Zielberuf aus.')
currentskill_dropdown = gr.Dropdown([],label="aktuelle Fähigkeiten", multiselect=True,info='Bitte wählen Sie die Fähigkeiten aus, die Sie derzeit besitzen')
sb_btn = gr.Button("Absenden")
skillgap_dropdown = gr.Dropdown([],label="Fähigkeiten", multiselect=True,info='Bitte wählen Sie die Fähigkeiten aus, die Sie lernen möchten.')
# Use gr.HTML to display the HTML content
button = gr.Button("Kursempfehlungen")
documents_output = gr.HTML()
occupation_dropdown.change(update_skills, inputs=occupation_dropdown, outputs=currentskill_dropdown)
sb_btn.click(
update_skillgap,
inputs=[occupation_dropdown,currentskill_dropdown],
outputs=skillgap_dropdown
)
button.click(
retrieve_documents,
inputs=[occupation_dropdown,skillgap_dropdown],
outputs=documents_output
)
print('Initialization completed')
demo.launch(server_name="0.0.0.0", server_port=7860)