Spaces:

wt3639
/

Course_rec

Paused

File size: 10,540 Bytes

import gradio as gr
import pandas as pd
import redis
import json
import requests
from config import *
import functools
from embedding_setup import retriever, find_similar_occupation, compare_docs_with_context,generate_exp,generate_prompt_exp
from data_process import  get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
# Skill data loaded once at import time. get_occupation_skills() looks entries
# up by occupation URI and expects each value to be a JSON-encoded string.
# The file name and the disabled Redis client below suggest this is a dump of
# a former Redis store -- TODO confirm.
# JSON text is UTF-8; state the encoding explicitly so German labels do not
# depend on the container's locale default.
with open('/app/data/redis_data.json', 'r', encoding='utf-8') as file:
    data_dict = json.load(file)
#r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True)

# Cache mapping a German skill label to the full skill record it came from.
# Filled as a side effect of get_occupation_skills() and
# get_occupation_skills_BA(); read by get_candidate_courses().
skill_details_mapping = {}


# Function to retrieve documents based on selected skills
def retrieve_documents(occupation,skills):
    """Build the HTML recommendation report for a target occupation and skill gap.

    Candidate courses are collected from the retriever (two per
    occupation+skill query, five for the combined query), de-duplicated by
    their ``id`` metadata, ranked with ``compare_docs_with_context`` and the
    best five (or fewer, if fewer were found) are listed together with a
    generated explanation.

    Args:
        occupation: Label selected in the occupation dropdown; resolved via
            the module-level ``occupations`` mapping.
        skills: Iterable of German skill labels describing the skill gap.
            ``None`` is treated as an empty selection.

    Returns:
        An HTML fragment whose parts are joined with ``<br>``.
    """
    max_recommendations = 5

    # Treat a missing selection as "no skills" so the loops below stay valid.
    skills = skills or []

    output = ["<div style=\"text-align: center; font-size: 24px;\">Empfehlungsergebnisse:</div>"]
    oc_uri = occupations.get(occupation, "")

    if isinstance(oc_uri, int):
        # Integer ids are resolved against the local berufe_info.csv table.
        df = pd.read_csv("/app/data/berufe_info.csv")
        target_occupation = df[df['id'] == oc_uri]
        target_occupation_name = target_occupation['short name'].values[0]
        target_occupation_dsp = target_occupation['description'].values[0]
        target_occupation_query = target_occupation_name + ' ' + target_occupation_dsp
    else:
        # Every other identifier goes through the occupation-detail helpers.
        target_occupation = get_occupation_detial(oc_uri)
        target_occupation_name, target_occupation_dsp, target_occupation_query = build_occupation_query(target_occupation)

    # One focused retrieval per skill; keep the two best hits of each.
    candidate_docs = []
    for german_label in skills:
        ocsk_query = target_occupation_name + ' ' + german_label
        candidate_docs.extend(retriever.get_relevant_documents(ocsk_query)[:2])

    # Each label keeps its trailing space so the query text is unchanged.
    skill_query = ''.join(german_label + ' ' for german_label in skills)
    query = target_occupation_query + ' ' + skill_query
    llama_query = 'info:' + target_occupation_name + ' ' + 'Skills gap:' + skill_query
    print(query)
    candidate_docs.extend(retriever.get_relevant_documents(query)[:5])

    # Remove duplicates, keeping the first occurrence of every course id.
    seen_course_ids = set()
    candidate_doc_unique = []
    for doc in candidate_docs:
        course_id = doc.metadata.get('id','')
        if course_id not in seen_course_ids:
            candidate_doc_unique.append(doc)
            seen_course_ids.add(course_id)

    partial_compare_docs = functools.partial(compare_docs_with_context, target_occupation_name=target_occupation_name, target_occupation_dsp=target_occupation_dsp,skill_gap = skill_query)
    sorted_docs = sorted(candidate_doc_unique, key=functools.cmp_to_key(partial_compare_docs), reverse=True)

    # Fewer than five unique courses may have been found; only work with the
    # ones that exist instead of indexing a fixed range.
    top_docs = sorted_docs[:max_recommendations]

    batch_prompts = []
    for doc in top_docs:
        doc_name = doc.metadata.get('name', 'Unnamed Document')
        doc_skill = doc.metadata.get('skills', '')
        input_text = f"target occupation: {llama_query}\n Recommended course: name: {doc_name}, learning objectives: {doc_skill[:2000]}"
        batch_prompts.append(generate_prompt_exp(input_text))

    # Generate all explanations in one batch; nothing to generate without prompts.
    batch_output = generate_exp(batch_prompts) if batch_prompts else []

    output.append(f"<b>Zielberuf:</b> {target_occupation_name}")
    output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
    output.append("<b>Empfohlene Kurse:</b>")
    for index, doc in enumerate(top_docs):
        doc_name = doc.metadata.get('name', 'Unnamed Document')
        doc_url = doc.metadata.get('url', '#')
        # Still list the course if the generator returned fewer explanations.
        explanation = batch_output[index] if index < len(batch_output) else ''
        output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
        output.append(f"<b>Empfehlungsgrund:</b> {explanation}")

    output.append("<br>")
    return "<br>".join(output)


def get_candidate_courses(occupation, skills):
    """List course offers for occupations related to the selected skills.

    Steps: resolve every selected skill to related ESCO occupations through
    the ESCO API, map their ISCO codes to KldB codes with
    kldb_isco_lookup.csv, pick the five entries of berufe_info.csv most
    similar to the target occupation, and fetch courses for the ids found in
    their entry requirements via ``get_courses_from_BA``.

    Args:
        occupation: Label selected in the occupation dropdown; resolved via
            the module-level ``occupations`` mapping.
        skills: Iterable of German skill labels previously registered in
            ``skill_details_mapping``. ``None`` is treated as empty.

    Returns:
        An HTML fragment (header plus one link per course) joined with
        ``<br>``. Only the header is returned when no related occupation or
        course is found.
    """
    request_timeout = 10  # seconds; keeps a stalled API call from blocking the UI

    output = ["<div style=\"text-align: center; font-size: 24px;\">Empfehlungsergebnisse:</div>"]
    df_lookup = pd.read_csv('/app/data/kldb_isco_lookup.csv')
    df_berufe = pd.read_csv('/app/data/berufe_info.csv')

    oc_uri = occupations.get(occupation, "")
    target_occupation = get_occupation_detial(oc_uri)
    # build_occupation_query yields (name, description, query), as unpacked
    # in retrieve_documents; only the query text is used for the similarity
    # search here.
    _, _, target_occupation_query = build_occupation_query(target_occupation)

    # 1) Skills -> links of occupations for which the skill is essential/optional.
    occupation_hrefs = set()
    for german_label in skills or []:
        skill_uri = skill_details_mapping.get(german_label, {}).get('uri')
        if not skill_uri:
            # Unknown label or a record without an ESCO URI: nothing to query.
            continue
        try:
            skill_response = requests.get(
                'https://ec.europa.eu/esco/api/resource/skill',
                params={'selectedVersion': 'v1.0.9', 'language': 'en', 'uri': skill_uri},
                timeout=request_timeout,
            )
            skill_response.raise_for_status()
            skill_links = skill_response.json().get('_links', {})
        except (requests.RequestException, ValueError) as e:
            print(f"Error while fetching skill details: {e}")
            continue

        # Combine essential and optional occupations
        related_occupations = (skill_links.get('isEssentialForOccupation', []) +
                               skill_links.get('isOptionalForOccupation', []))
        for related in related_occupations:
            href = related.get('href')
            if href:
                occupation_hrefs.add(href)

    # 2) Occupation links -> ISCO codes (the part before the first dot).
    isco_codes = set()
    for href in occupation_hrefs:
        try:
            occupation_response = requests.get(href, timeout=request_timeout)
            occupation_response.raise_for_status()
            occupation_details = occupation_response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Error while fetching occupation details: {e}")
            continue

        code = occupation_details.get('code')
        if code:
            isco_prefix = code.split('.')[0]
            if isco_prefix.isdigit():
                isco_codes.add(int(isco_prefix))

    # 3) ISCO codes -> KldB codes -> matching rows of berufe_info.csv.
    matching_lookup = df_lookup[df_lookup['isco08'].isin(list(isco_codes))]
    kldb_labels = ['B ' + str(code) for code in set(matching_lookup['kldb2010'].values)]
    merged_df = df_berufe[df_berufe['KldB codes'].isin(kldb_labels)].reset_index(drop=True)
    if merged_df.empty:
        # No related occupation could be mapped; there is nothing to rank.
        return "<br>".join(output)

    # 4) Most similar occupations -> ids referenced by their entry requirements.
    BA_berufe = set()
    top_k_berufe = find_similar_occupation(target_occupation_query,merged_df,5,'cosine')
    for beruf in top_k_berufe:
        entry_requirement = beruf.metadata['entry_requirements']
        # The column holds single-quoted pseudo-JSON; normalise the quotes first.
        entry_requirement_json = json.loads(entry_requirement.replace("'", '"'))
        for js in entry_requirement_json:
            BA_berufe.add(str(js['data_idref']))

    # 5) Ids -> course offers.
    courses = get_courses_from_BA(BA_berufe)
    try:
        termine = courses['_embedded']['termine']
    except (KeyError, TypeError):
        # Response without embedded results: report no courses.
        termine = []
    for course in termine:
        output.append(f"<a href='{course['angebot']['link']}' target='_blank'>{course['angebot']['titel']}</a>")

    return "<br>".join(output)


def get_occupation_skills(oc_uri):
    """Return the German skill labels stored for an occupation URI.

    The entry for ``oc_uri`` in ``data_dict`` is a JSON-encoded list of skill
    records. Every record is also cached in ``skill_details_mapping`` under
    its German preferred label.

    Args:
        oc_uri: Key to look up in ``data_dict``.

    Returns:
        The German labels in stored order, or an empty list when there is no
        (non-empty) entry for ``oc_uri``.
    """
    #skills_json = r.get(oc_uri)
    raw_entry = data_dict.get(oc_uri, None)
    if not raw_entry:
        return []

    labels = []
    for record in json.loads(raw_entry):
        label_de = record['preferredLabel']['de']
        skill_details_mapping[label_de] = record
        labels.append(label_de)
    return labels
    
def _parse_skill_cell(raw):
    """Parse one serialized skill-list cell of berufe_info.csv into a list."""
    import ast

    # Missing cells come back from pandas as NaN (a float), not as text.
    if not isinstance(raw, str) or not raw.strip():
        return []
    try:
        # The cells use single quotes; normalise them so json can read them.
        return json.loads(raw.replace("'", "\""))
    except json.JSONDecodeError as json_error:
        # Quote replacement breaks on labels that contain an apostrophe;
        # fall back to reading the untouched text as a Python literal.
        try:
            return ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            raise json_error from None


def get_occupation_skills_BA(oc_uri):
    """Return the skill labels of a berufe_info.csv occupation.

    Essential skills are listed first, followed by optional skills. Every
    record is also cached in ``skill_details_mapping`` under its label.
    The two columns are parsed independently, so an empty list in either
    column no longer produces invalid JSON.

    Args:
        oc_uri: Value of the ``id`` column identifying the occupation.

    Returns:
        The list of skill labels, or an empty list when no row has that id.
    """
    df = pd.read_csv("/app/data/berufe_info.csv")
    rows = df[df['id'] == oc_uri]
    if rows.empty:
        return []

    skill_labels = []
    for column in ('essential skills', 'optional skills'):
        for skill in _parse_skill_cell(rows[column].values[0]):
            german_label = skill['skill']
            skill_details_mapping[german_label] = skill
            skill_labels.append(german_label)
    return skill_labels

# Function to update the skills dropdown
def update_skills(occupation):
    """Return the "current skills" dropdown filled for the chosen occupation.

    Args:
        occupation: Label selected in the occupation dropdown; resolved via
            the module-level ``occupations`` mapping.

    Returns:
        A multiselect ``gr.Dropdown`` whose choices are the occupation's
        skill labels.
    """
    oc_uri = occupations.get(occupation, "")
    # Integer ids are looked up in berufe_info.csv, anything else in data_dict.
    if isinstance(oc_uri, int):
        skills = get_occupation_skills_BA(oc_uri)
    else:
        skills = get_occupation_skills(oc_uri)
    return gr.Dropdown(skills,label="aktuelle Fähigkeiten", multiselect=True,info='Bitte wählen Sie die Fähigkeiten aus, die Sie derzeit besitzen')

def update_skillgap(occupation, current_skills):
    """Return the skill-gap dropdown: occupation skills the user lacks.

    Args:
        occupation: Label selected in the occupation dropdown; resolved via
            the module-level ``occupations`` mapping.
        current_skills: Skill labels the user already has. ``None`` is
            treated as an empty selection.

    Returns:
        A multiselect ``gr.Dropdown`` whose choices are the occupation's
        skills that are not in ``current_skills``, in their original order.
    """
    oc_uri = occupations.get(occupation, "")
    # Integer ids are looked up in berufe_info.csv, anything else in data_dict.
    if isinstance(oc_uri, int):
        occupation_skills = get_occupation_skills_BA(oc_uri)
    else:
        occupation_skills = get_occupation_skills(oc_uri)

    # A set gives constant-time membership tests and tolerates a None selection.
    already_known = set(current_skills or [])
    skill_gap = [skill for skill in occupation_skills if skill not in already_known]

    return gr.Dropdown(skill_gap, label="Qualifikationslücke", multiselect=True, info='Bitte wählen Sie die Fähigkeiten aus, die Sie lernen möchten.')

# Script entry point: build the occupation lookup, wire up the Gradio UI and
# start the web server.
if __name__ == "__main__":
    # Load occupations from two sources: the CSV at CSV_FILE_PATH and the
    # 'short name' -> 'id' columns of berufe_info.csv.
    occupations_esco = get_occupations_from_csv(CSV_FILE_PATH)
    df = pd.read_csv("/app/data/berufe_info.csv")
    occupations_BA = df[['short name', 'id']].set_index('short name').to_dict()['id']
    # `occupations` is read as a module-level global by the callbacks.
    # On duplicate names the berufe_info.csv entry overrides the other one.
    occupations = {**occupations_esco, **occupations_BA}
    # Gradio interface; components are rendered in the order they are created.
    with gr.Blocks(title="MyEduLife Kursempfehlungssystem") as demo:
        occupation_dropdown = gr.Dropdown(list(occupations.keys()), label="Zielberuf",info='Bitte wählen Sie Ihren Zielberuf aus.')
        currentskill_dropdown = gr.Dropdown([],label="aktuelle Fähigkeiten", multiselect=True,info='Bitte wählen Sie die Fähigkeiten aus, die Sie derzeit besitzen')
        sb_btn = gr.Button("Absenden")
        skillgap_dropdown = gr.Dropdown([],label="Fähigkeiten", multiselect=True,info='Bitte wählen Sie die Fähigkeiten aus, die Sie lernen möchten.')
        # The recommendation report is an HTML string, so it is shown in gr.HTML.
        button = gr.Button("Kursempfehlungen")
        documents_output = gr.HTML()

        # Choosing an occupation refills the "current skills" dropdown.
        occupation_dropdown.change(update_skills, inputs=occupation_dropdown, outputs=currentskill_dropdown)

        # "Absenden" computes the skill gap from the occupation and current skills.
        sb_btn.click(
                    update_skillgap, 
                    inputs=[occupation_dropdown,currentskill_dropdown], 
                    outputs=skillgap_dropdown
                )

        # "Kursempfehlungen" renders the recommendations for the selected gap.
        button.click(
                    retrieve_documents, 
                    inputs=[occupation_dropdown,skillgap_dropdown],
                    outputs=documents_output
                    )
    print('Initialization completed')
    # Listen on all network interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)