File size: 6,433 Bytes
b5a35fb
0971dea
 
 
 
 
523e4cc
0971dea
 
 
94dabc7
69bd769
 
 
2830df9
69bd769
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import json
import os

import google.generativeai as genai
import gradio as gr
import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity

# Course catalogue; must provide the "Transformed_description" text column
# that is embedded below.
df = pd.read_csv("cleaned_data.csv")

# Sentence encoder shared by the catalogue embeddings and the per-request
# user-profile embedding.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')


def _encode_description(description):
    """Return the sentence embedding of one course description as a tensor."""
    return bert_model.encode(description, convert_to_tensor=True)


# One embedding tensor per course, computed once at import time.
df["course_embedding"] = df["Transformed_description"].apply(_encode_description)

# The Gemini API key is read from the environment (None when it is unset).
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

_GEMINI_MODEL = "gemini-pro"
_LEVELS = ("beginner", "intermediate", "advanced")


def _split_field(text):
    """Split a comma-separated form field into stripped, non-empty items."""
    return [item.strip() for item in (text or "").split(",") if item.strip()]


def _ask_gemini(prompt):
    """Send *prompt* to Gemini via the configured ``genai`` SDK and return the reply text."""
    return genai.GenerativeModel(_GEMINI_MODEL).generate_content(prompt).text


def _min_max_scale(values):
    """Scale a numeric Series to [0, 1]; a constant (or all-NaN) Series maps to zeros."""
    low = values.min()
    span = values.max() - low
    if not span > 0:
        return pd.Series(0.0, index=values.index)
    return (values - low) / span


def _parse_categorized(raw_text):
    """Parse Gemini's categorization reply into a dict holding one list per level.

    Raises ValueError when the reply is not a JSON object.
    """
    cleaned = raw_text.strip().replace('```json', '').replace('```', '')
    parsed = json.loads(cleaned)
    if not isinstance(parsed, dict):
        raise ValueError("expected a JSON object")
    for level in _LEVELS:
        courses = parsed.get(level)
        # Every entry of a level must be a {"name": ..., "url": ...} dict;
        # a missing or malformed level is reset to an empty list.
        well_formed = isinstance(courses, list) and all(
            isinstance(course, dict) and "name" in course and "url" in course
            for course in courses
        )
        if not well_formed:
            parsed[level] = []
    return parsed


def recommend_courses(skills, interests, experience, education, time, certificates, careerpath):
    """Recommend courses for a user profile, grouped by difficulty level.

    Each argument is a comma-separated string taken from the Gradio form
    (``None`` or an empty string means "nothing provided").

    Steps:
      1. Ask Gemini for a one-word skill level for the profile.
      2. Embed the profile text and rank every course in ``df`` by a weighted
         sum of cosine similarity, normalised rating and normalised difficulty.
      3. Ask Gemini to sort the six best courses into beginner / intermediate /
         advanced and validate the JSON it returns.

    Returns:
        dict: ``{"beginner": [...], "intermediate": [...], "advanced": [...]}``
        where each entry is a dict with at least ``"name"`` and ``"url"``.
        If the categorization reply cannot be parsed, the three lists are empty
        and ``"error"`` is set to ``"Failed to categorize courses"``. Any other
        failure yields ``{"error": <message>}``.
    """
    try:
        user_profile = {
            "skills": _split_field(skills),
            "interests": _split_field(interests),
            "experience": _split_field(experience),
            "education": _split_field(education),
            "time": _split_field(time),
            "certificates": _split_field(certificates),
            "careerpath": _split_field(careerpath),
        }

        # Step 1: skill level assessment.
        skill_prompt = f"""
            Give the current skill level in one word out of 'beginner', 'intermediate', 'advanced'.
            Here is the user profile: {user_profile}
            strictly do not output any extra textual data."""
        current_skill = _ask_gemini(skill_prompt).strip().replace("\n", "")
        user_profile["CurrentSkill"] = [current_skill]

        # Dict insertion order gives: skills, interests, experience, education,
        # time, certificates, careerpath, CurrentSkill.
        user_text = " ".join(item for items in user_profile.values() for item in items)

        # Step 2: similarity between the profile and every course description.
        user_embedding = bert_model.encode(user_text, convert_to_tensor=True)
        course_embeddings = torch.stack(df["course_embedding"].tolist())
        # Explicit (1, D) vs (N, D) comparison along dim=1 -> one score per course.
        similarities = cosine_similarity(user_embedding.unsqueeze(0), course_embeddings, dim=1)

        # NOTE(review): a 0.1 "time_to_complete" weight used to be declared but
        # was never applied to any column, so the score tops out at 0.9.
        weights = {
            "similarity": 0.6,
            "rating": 0.2,
            "difficulty": 0.1,
        }

        normalized_rating = _min_max_scale(df["course_rating"])

        # Assumes "course_difficulty" is numeric with larger meaning harder -- TODO confirm.
        max_difficulty = df["course_difficulty"].max()
        if max_difficulty > 0:
            normalized_difficulty = 1 - (df["course_difficulty"] / max_difficulty)
        else:
            normalized_difficulty = pd.Series(1.0, index=df.index)

        # Score in a per-request frame so the shared module-level ``df`` is
        # not modified by individual requests.
        scored = pd.DataFrame(
            {
                "course_name": df["course_name"],
                "course_url": df["course_url"],
                "normalized_difficulty": normalized_difficulty,
                "ranking_score": (
                    weights["similarity"] * similarities.cpu().numpy()
                    + weights["rating"] * normalized_rating.values
                    + weights["difficulty"] * normalized_difficulty.values
                ),
            }
        )

        top_courses = scored.sort_values(by="ranking_score", ascending=False).head(6)
        output = top_courses["course_name"].tolist()
        difficulties = top_courses["normalized_difficulty"].tolist()
        # Only the selected courses' URLs go into the prompt; interpolating the
        # whole column would insert a truncated Series repr instead.
        course_urls = [
            {"name": name, "url": url}
            for name, url in zip(top_courses["course_name"], top_courses["course_url"])
        ]

        # Step 3: let Gemini bucket the shortlisted courses by level.
        categorize_prompt = f"""
            Return a JSON object with this exact structure:
            {{
                "beginner": [
                    {{"name": "course name", "url": "course url"}}
                ],
                "intermediate": [
                    {{"name": "course name", "url": "course url"}}
                ],
                "advanced": [
                    {{"name": "course name", "url": "course url"}}
                ]
            }}
            
            Categorize these courses: {output}

            Add Url of the specific course from {course_urls}
            Based on:
            - User skill level: {current_skill}
            - Course difficulties: {difficulties}
            - User skills: {user_profile['skills']}
            
            Categorise atleast one course for each beginner, intermediate and advanced.
            Return ONLY valid JSON without any extra text.
            """
        reply = _ask_gemini(categorize_prompt)

        try:
            return _parse_categorized(reply)
        except (ValueError, TypeError):
            # json.JSONDecodeError is a ValueError; unusable replies fall back
            # to an empty categorization rather than surfacing a parser error.
            return {
                "beginner": [],
                "intermediate": [],
                "advanced": [],
                "error": "Failed to categorize courses"
            }

    except Exception as e:
        # UI boundary: report the failure in the JSON output instead of raising.
        return {"error": str(e)}

# (label, placeholder) for each free-text profile field, listed in the same
# order as the positional parameters of recommend_courses.
_PROFILE_FIELDS = (
    ("Skills", "python, machine learning"),
    ("Interests", "AI, data science"),
    ("Experience", "2 years python"),
    ("Education", "bachelor's in CS"),
    ("Time Available", "6 months"),
    ("Certificates", "AWS, GCP"),
    ("Career Path", "ML engineer"),
)

# Gradio UI: seven text boxes in, one JSON panel out.
iface = gr.Interface(
    fn=recommend_courses,
    inputs=[
        gr.Textbox(label=field_label, placeholder=field_hint)
        for field_label, field_hint in _PROFILE_FIELDS
    ],
    outputs=gr.JSON(),
    title="Personalized Course Recommender",
    description="Enter your profile details to get course recommendations organized by difficulty level",
)

# Start the Gradio server only when this file is run as a script, not on import.
if __name__ == "__main__":
    # NOTE(review): per Gradio's launch docs, share=True also requests a
    # public share link -- confirm that public exposure is intended.
    iface.launch(share=True)