Tao Wu commited on
Commit
6b463ef
1 Parent(s): 5aed371
app/app.py CHANGED
@@ -4,7 +4,8 @@ import redis
4
  import json
5
  import requests
6
  from config import *
7
- from embedding_setup import retriever, find_similar_occupation
 
8
  from data_process import build_skill_query, get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
9
  with open('/app/data/redis_data.json', 'r') as file:
10
  data_dict = json.load(file)
@@ -12,7 +13,7 @@ with open('/app/data/redis_data.json', 'r') as file:
12
 
13
  skill_details_mapping = {}
14
 
15
-
16
  # Function to retrieve documents based on selected skills
17
  def retrieve_documents(occupation,skills):
18
  output = []
@@ -22,19 +23,26 @@ def retrieve_documents(occupation,skills):
22
  if isinstance(oc_uri, int):
23
  df = pd.read_csv("/app/data/berufe_info.csv")
24
  target_occupation = df[df['id'] == oc_uri]
25
- target_occupation_query = target_occupation['short name'] + ' ' + target_occupation['description']
26
- target_occupation_query = target_occupation_query.values[0]
 
 
27
  else:
28
  target_occupation = get_occupation_detial(oc_uri)
29
- target_occupation_query = build_occupation_query(target_occupation)
30
  for german_label in skills:
31
  skill_query += german_label + ' '
32
  query = target_occupation_query + ' ' + skill_query
33
  print(query)
34
  docs = retriever.get_relevant_documents(query)
 
 
 
 
 
35
  output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
36
  output.append(f"<b>Empfohlene Kurse:</b>")
37
- for doc in docs:
38
  doc_name = doc.metadata.get('name', 'Unnamed Document')
39
  doc_url = doc.metadata.get('url', '#')
40
  output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
 
4
  import json
5
  import requests
6
  from config import *
7
+ import functools
8
+ from embedding_setup import retriever, find_similar_occupation, evaluate, compare_docs_with_context
9
  from data_process import build_skill_query, get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
10
  with open('/app/data/redis_data.json', 'r') as file:
11
  data_dict = json.load(file)
 
13
 
14
  skill_details_mapping = {}
15
 
16
+ df_course = pd.read_csv('/app/data/all_course_info.csv')
17
  # Function to retrieve documents based on selected skills
18
  def retrieve_documents(occupation,skills):
19
  output = []
 
23
  if isinstance(oc_uri, int):
24
  df = pd.read_csv("/app/data/berufe_info.csv")
25
  target_occupation = df[df['id'] == oc_uri]
26
+ target_occupation_name = target_occupation['short name'].values[0]
27
+ target_occupation_dsp = target_occupation['description'].values[0]
28
+ target_occupation_query = target_occupation_name + ' ' + target_occupation_dsp
29
+ target_occupation_query = target_occupation_query
30
  else:
31
  target_occupation = get_occupation_detial(oc_uri)
32
+ target_occupation_name, target_occupation_dsp, target_occupation_query = build_occupation_query(target_occupation)
33
  for german_label in skills:
34
  skill_query += german_label + ' '
35
  query = target_occupation_query + ' ' + skill_query
36
  print(query)
37
  docs = retriever.get_relevant_documents(query)
38
+
39
+ partial_compare_docs = functools.partial(compare_docs_with_context, df_course=df_course, target_occupation_name=target_occupation_name, target_occupation_dsp=target_occupation_dsp,skll_gap = skill_query)
40
+ sorted_docs = sorted(docs, key=functools.cmp_to_key(partial_compare_docs), reverse=True)
41
+
42
+
43
  output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
44
  output.append(f"<b>Empfohlene Kurse:</b>")
45
+ for doc in sorted_docs:
46
  doc_name = doc.metadata.get('name', 'Unnamed Document')
47
  doc_url = doc.metadata.get('url', '#')
48
  output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
app/data/all_course_info.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a5fbf2e2d50867cb626d82e94d197de14e43d8057e2d26deb7a41551c03cbcc
3
+ size 40197384
app/data_process.py CHANGED
@@ -32,13 +32,15 @@ def build_skill_query(skill):
32
 
33
 
34
  def build_occupation_query(occupation):
35
- occupation_query = occupation['preferredLabel'].get('de','') +" " + occupation['preferredLabel'].get('en','')+" "+ occupation['description'].get('de','').get('literal','') + " "+ occupation['description'].get('en','').get('literal','')
 
 
36
  if occupation['_links']['broaderIscoGroup']:
37
  for group in occupation['_links']['broaderIscoGroup']:
38
  occupation_query += " " + group['title']
39
  else:
40
  pass
41
- return occupation_query
42
 
43
  # Get occupations from a CSV
44
  def get_occupations_from_csv(file_path):
 
32
 
33
 
34
  def build_occupation_query(occupation):
35
+ occupation_name_de = occupation['preferredLabel'].get('de','')
36
+ occupation_dsp = occupation['description'].get('de','').get('literal','')
37
+ occupation_query = occupation_name_de +" " + occupation['preferredLabel'].get('en','')+" "+ occupation['description'].get('de','').get('literal','') + " "+ occupation_dsp
38
  if occupation['_links']['broaderIscoGroup']:
39
  for group in occupation['_links']['broaderIscoGroup']:
40
  occupation_query += " " + group['title']
41
  else:
42
  pass
43
+ return occupation_name_de,occupation_dsp,occupation_query
44
 
45
  # Get occupations from a CSV
46
  def get_occupations_from_csv(file_path):
app/embedding_setup.py CHANGED
@@ -1,10 +1,17 @@
1
  from langchain_community.vectorstores import Chroma
2
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
3
- from sentence_transformers import SentenceTransformer, util
4
  from langchain.docstore.document import Document
5
- import numpy as np
 
6
  from config import *
7
  import os
 
 
 
 
 
 
8
 
9
  os.environ['CURL_CA_BUNDLE'] = ""
10
  embedding_int = HuggingFaceBgeEmbeddings(
@@ -23,6 +30,123 @@ db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_in
23
  retriever = db.as_retriever(search_kwargs={"k": TOP_K})
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
27
 
28
  # Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.
 
1
  from langchain_community.vectorstores import Chroma
2
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
3
+
4
  from langchain.docstore.document import Document
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
6
+ from peft import PeftModel
7
  from config import *
8
  import os
9
+ import torch
10
+
11
+ if torch.cuda.is_available():
12
+ device = "cuda"
13
+ else:
14
+ device = "cpu"
15
 
16
  os.environ['CURL_CA_BUNDLE'] = ""
17
  embedding_int = HuggingFaceBgeEmbeddings(
 
30
  retriever = db.as_retriever(search_kwargs={"k": TOP_K})
31
 
32
 
33
+ LLM_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
34
+ lora_weights = "/hpcwork/vg380347/llama3/Instruct_8B_EngGer_alpaca_finetune_pairwise_skill_24_128/last_checkpoint"
35
+
36
+
37
+
38
+ tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, cache_dir="/hpcwork/vg380347/.cache")
39
+ LLM_model = AutoModelForCausalLM.from_pretrained(
40
+ LLM_MODEL, device_map="auto", trust_remote_code=True
41
+ )
42
+
43
+ first_token = 'First'
44
+ second_token = 'Second'
45
+ # 获取token的ID
46
+ first_id = tokenizer.convert_tokens_to_ids(first_token)
47
+ second_id = tokenizer.convert_tokens_to_ids(second_token)
48
+ model = AutoModelForCausalLM.from_pretrained(
49
+ MODEL_NAME,
50
+ torch_dtype=torch.float16,
51
+ device_map="auto",
52
+ )
53
+
54
+ rec_adapter = PeftModel.from_pretrained(
55
+ model,
56
+ lora_weights,
57
+ torch_dtype=torch.float16,
58
+ device_map={'': 0}
59
+ )
60
+
61
+ tokenizer.padding_side = "left"
62
+ # unwind broken decapoda-research config
63
+ #model.half() # seems to fix bugs for some users.
64
+ rec_adapter.eval()
65
+
66
+ rec_adapter.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
67
+ rec_adapter.config.bos_token_id = 1
68
+ rec_adapter.config.eos_token_id = 2
69
+
70
+ def generate_prompt(target_occupation, skill_gap, courses):
71
+ return f"""
72
+ ### Instruction:
73
+ "As an education expert, you have been provided with a target occupation, a skill gap, and information on two candidate courses. Your task is to determine which course better matches the target occupation and skill gap. Please respond with 'First' or 'Second' to indicate your recommendation.
74
+
75
+ ### Input:
76
+ Target Occupation: {target_occupation}
77
+ Skill Gap: {skill_gap}
78
+ candidate courses: {courses}
79
+
80
+ ### Response:
81
+ """
82
+ '''
83
+ prompt_re = ChatPromptTemplate.from_template(template_re)
84
+ chain_re = (
85
+ runnable
86
+ | prompt_re
87
+ )
88
+ '''
89
+ def evaluate(
90
+ prompt=None,
91
+ temperature=0,
92
+ top_p=1.0,
93
+ top_k=40,
94
+ num_beams=1,
95
+ max_new_tokens=120,
96
+ batch_size=1,
97
+ **kwargs,
98
+ ):
99
+
100
+ inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
101
+ generation_config = GenerationConfig(
102
+ temperature=temperature,
103
+ top_p=top_p,
104
+ top_k=top_k,
105
+ num_beams=num_beams,
106
+ **kwargs,
107
+ )
108
+ with torch.no_grad():
109
+ generation_output = model.generate(
110
+ **inputs,
111
+ generation_config=generation_config,
112
+ return_dict_in_generate=True,
113
+ output_scores=True,
114
+ max_new_tokens=max_new_tokens,
115
+ # batch_size=batch_size,
116
+ eos_token_id=tokenizer.eos_token_id,
117
+ pad_token_id=tokenizer.eos_token_id,
118
+ )
119
+ scores = generation_output.scores[0].softmax(dim=-1)
120
+ logits = torch.tensor(scores[:,[first_id, second_id]], dtype=torch.float32).softmax(dim=-1)
121
+ s = generation_output.sequences
122
+ output = tokenizer.batch_decode(s, skip_special_tokens=True)
123
+ output = [_.split('Response:\n')[-1] for _ in output]
124
+ return output, logits.tolist()
125
+
126
+ def compare_docs_with_context(doc_a, doc_b, df_course, target_occupation_name, target_occupation_dsp,skill_gap):
127
+ # Extract course details from the data frame
128
+ course_a = df_course[df_course['course_id'] == int(doc_a.metadata['id'])].iloc[0]
129
+ course_b = df_course[df_course['course_id'] == int(doc_b.metadata['id'])].iloc[0]
130
+ print('comapring...')
131
+ print(course_a['course_name'], course_b['course_name'])
132
+ # Prepare the input for chain_re.invoke
133
+
134
+ courses = f"First: name: {course_a['course_name']} description:{course_a['course_content_limited']} Second: name: {course_b['course_name']} description:{course_b['course_content_limited']}"
135
+ #courses = f"First: name: {course_a['course_name']} skills:{course_a['course_skills_edu']} Second: name: {course_b['course_name']} skills:{course_b['course_skills_edu']}"
136
+ target_occupation = f"name: {target_occupation_name} description: {target_occupation_dsp}"
137
+ skill_gap = skill_gap
138
+ prompt = generate_prompt(target_occupation, skill_gap, courses)
139
+ prompt = [prompt]
140
+ output, logit = evaluate(prompt)
141
+ # Compare based on the response: [A] means doc_a > doc_b, [B] means doc_a < doc_b
142
+ print(output, logit)
143
+ if logit[0][0] > logit[0][1]:
144
+ return 1 # doc_a should come before doc_b
145
+ elif logit[0][0] < logit[0][1]:
146
+ return -1 # doc_a should come after doc_b
147
+ else:
148
+ return 0 # Consider them equal if the response is unclear
149
+
150
  def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
151
 
152
  # Pro kurs wird ein Document erstellt. Dieses enthält Metadaten sowie einen page_content.