HuyDN commited on
Commit
097744c
1 Parent(s): 4ed3277

Phase1/HuyDN: Done Base Phase 1

Browse files
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  venv/
2
  .env
3
  __pycache__/
 
 
1
  venv/
2
  .env
3
  __pycache__/
4
+ credentials/
app/configs/database.py CHANGED
@@ -1,86 +1,43 @@
1
  import firebase_admin
2
- from firebase_admin import credentials, storage, db
3
-
4
- storageBucket = "fsa-firebase-tutorial.appspot.com"
5
- databaseURL = "https://fsa-firebase-tutorial-default-rtdb.asia-southeast1.firebasedatabase.app/"
6
-
7
- cred = credentials.Certificate('credentials\\fsa-firebase-database.json')
8
- firebase_admin.initialize_app(cred, {
9
- 'storageBucket': storageBucket,
10
- 'databaseURL': databaseURL
11
- })
12
-
13
-
14
- # Upload file to storage
15
- def upload_file(file_path, destination_path):
16
- bucket = storage.bucket(storageBucket)
17
- ref = bucket.blob(destination_path)
18
- ref.upload_from_filename(file_path)
19
- return True
20
-
21
- # Add file's url and metadata to realtime database
22
- def store_metadata(file_url, description):
23
- database = db.reference('exam_data') # Or your desired path
24
- new_file_ref = database.push()
25
- new_file_ref.set({
26
- 'file_url': file_url,
27
- 'description': description
 
 
 
 
 
 
 
28
  })
29
- return True
30
-
31
- # Query file's url and metadata from file's metadata
32
- def query_file_from_metadata(target_description):
33
- temp_file_dict = {}
34
- database_json = db.reference('exam_data').get()
35
- for key, value in database_json.items():
36
- if (target_description in value.get('description')):
37
- temp_file_dict.update({key: value})
38
- return temp_file_dict
39
-
40
- # Query database path
41
- def query_file(path):
42
- database = db.reference(path)
43
- ref = database.get()
44
- return ref
45
-
46
- # Extract file's url from a bunch of things
47
- def extract_file_url(file_dict):
48
- file_urls = []
49
- for key, value in file_dict.items():
50
- file_urls.append(value['file_url'])
51
- return file_urls
52
-
53
- # Extract file's url with target description from a bunch of things
54
- def extract_file_url_by_description(file_dict, target_description):
55
- file_urls = []
56
- for key, value in file_dict.items():
57
- if value.get('description') == target_description:
58
- file_urls.append(value.get('file_url'))
59
- return file_urls
60
 
61
- # Read file content, given its url
62
- def read_from_file_url(file_url_list):
63
- temp_content_list = []
64
- bucket = storage.bucket(storageBucket)
65
- for file_url in file_url_list:
66
- parts = []
67
- parts = file_url.split('/')
68
- legal_file_url = parts[3:][0]
69
- ref = bucket.blob(legal_file_url)
70
- file_content = ref.download_as_string().decode('utf-8')
71
- temp_content_list.append(file_content)
72
- return temp_content_list
73
 
74
- # Download file content, given its url
75
- def download_from_file_url(file_url_list, local_directory):
76
- temp_content_list = []
77
- bucket = storage.bucket(storageBucket)
78
- for file_url in file_url_list:
79
- parts = []
80
- parts = file_url.split('/')
81
- legal_file_url = parts[3:][0]
82
- ref = bucket.blob(legal_file_url)
83
- local_filename = f"{local_directory}/{legal_file_url}"
84
- ref.download_to_filename(local_filename)
85
- print(f"Downloaded {legal_file_url} to {local_filename}")
86
- return True
 
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from firebase_admin import storage

import os
from dotenv import load_dotenv

# Load the environment variables from the project .env file.
load_dotenv()

firebase_url_storageBucket = os.getenv("FIREBASE_URL_STORAGEBUCKET")

# The service-account credentials are assembled from individual .env entries
# so that no JSON key file has to live in the repository.
# BUG FIX: guard the private key explicitly — the old code called
# os.getenv("FIREBASE_PRIVATE_KEY").replace(...), which crashed with an
# opaque AttributeError when the variable was missing.
_private_key = os.getenv("FIREBASE_PRIVATE_KEY")
if _private_key is None:
    raise RuntimeError("FIREBASE_PRIVATE_KEY is not set; check your .env file")

# get credentials from .env
credential_firebase = {
    "type": os.getenv("FIREBASE_TYPE"),
    "project_id": os.getenv("FIREBASE_PROJECT_ID"),
    "private_key_id": os.getenv("FIREBASE_PRIVATE_KEY_ID"),
    # .env stores the key with literal "\n" sequences; restore real newlines.
    "private_key": _private_key.replace('\\n', '\n'),
    "client_email": os.getenv("FIREBASE_CLIENT_EMAIL"),
    "client_id": os.getenv("FIREBASE_CLIENT_ID"),
    "auth_uri": os.getenv("FIREBASE_AUTH_URI"),
    "token_uri": os.getenv("FIREBASE_TOKEN_URI"),
    "auth_provider_x509_cert_url": os.getenv("FIREBASE_AUTH_PROVIDER_X509_CERT_URL"),
    "client_x509_cert_url": os.getenv("FIREBASE_CLIENT_X509_CERT_URL"),
    "universe_domain": os.getenv("FIREBASE_UNIVERSE_DOMAIN")
}

# Initialize the app only once — this module may be imported by several
# modules, and firebase_admin raises on double initialization.
if not firebase_admin._apps:
    # Initialize the app with a service account, granting admin privileges
    cred = credentials.Certificate(credential_firebase)
    firebase_admin.initialize_app(cred, {
        'storageBucket': firebase_url_storageBucket
    })

# Firestore client shared by the CRUD modules.
firebase_db = firestore.client()
print("Firestore connected")

# Default Storage bucket shared by the CRUD modules.
firebase_bucket = storage.bucket(app=firebase_admin.get_app())
print("Storage connected")
 
 
 
 
 
 
 
 
 
 
app/modules/__init__.py CHANGED
@@ -1,8 +1,10 @@
1
  from fastapi import APIRouter
2
- from app.modules.question_retrieval import qtretrieval_router
 
3
 
4
  modules_router = APIRouter(prefix="/modules", tags=["modules"])
5
  modules_router.include_router(qtretrieval_router)
 
6
 
7
  @modules_router.get("/")
8
  async def index():
 
from fastapi import APIRouter
from app.modules.question_tests_retrieval import qtretrieval_router
from app.modules.crud_question_test import crud_question_tests_router

# Parent router: every feature router below is served under /modules.
modules_router = APIRouter(prefix="/modules", tags=["modules"])

# Mount the feature sub-routers (order preserved).
for _sub_router in (qtretrieval_router, crud_question_tests_router):
    modules_router.include_router(_sub_router)
8
 
9
  @modules_router.get("/")
10
  async def index():
app/modules/crud_question_test/__init__.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import APIRouter, UploadFile, File
from typing import Annotated

from app.modules.crud_question_test.models.crud_question_tests import get_all_question_tests, get_question_test_by_id, create_question_test, update_question_test, delete_question_test

crud_question_tests_router = APIRouter(prefix="/crud_question_tests_router", tags=["crud_question_tests_router"])


# [GET] all question tests
@crud_question_tests_router.get("/")
async def index():
    """Return every question-test document, each with its Firestore id."""
    data = get_all_question_tests()
    return data


# [POST] add question test
@crud_question_tests_router.post("/")
# only upload pdf or json file
async def add_question_test(description: str, role: str, file_question_tests: Annotated[UploadFile, File(..., description="The question test file", media_type=["application/pdf", "application/json"])]):
    """Store an uploaded question-test file (.pdf or .json) and its metadata.

    BUG FIX: the PDF branch previously built the payload with the keys
    {"description", "role", "question_tests"}, but create_question_test()
    reads data["question_tests_url"] — so every PDF upload failed with a
    KeyError and the two branches wrote inconsistent document schemas.
    Both content types now share the schema used by the JSON branch.
    """
    try:
        # check if file is pdf or json
        if file_question_tests.content_type in ("application/pdf", "application/json"):
            payload = {
                "question_tests_description": description,
                "question_tests_role": role,
                "question_tests_url": file_question_tests,
            }
            # create a new document
            if create_question_test(payload):
                return {"message": "Question test added successfully"}
            else:
                return {"message": "Error"}
        else:
            return {"message": "File type not supported"}
    except Exception as e:
        return {"message": "Error", "error": str(e)}


# [DELETE] question test by id
@crud_question_tests_router.delete("/{id}")
async def delete_question_test_by_id(id: str):
    """Delete the document with the given id (and its stored file)."""
    if delete_question_test(id):
        return {"message": f"Question test have id {id} deleted successfully"}
    else:
        return {"message": "Error"}
app/modules/crud_question_test/models/crud_question_tests.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from app.configs.database import firebase_bucket, firebase_db

# CRUD operations for the "question_tests" Firestore collection and the
# files backing each document in Firebase Storage.

def upload_file_question_tests(file):
    """Upload a file-like object (FastAPI UploadFile) and return its gs:// URL."""
    blob = firebase_bucket.blob(file.filename)
    blob.upload_from_file(file.file)
    # return gs link
    return f"gs://{firebase_bucket.name}/{file.filename}"

def remove_file_question_tests(file_url):
    """Delete the Storage object referenced by a "gs://" URL."""
    blob = firebase_bucket.blob(file_url.split(f"gs://{firebase_bucket.name}/")[1])
    blob.delete()
    return True

def get_all_question_tests():
    """Return all documents in the collection, each extended with its id."""
    docs = firebase_db.collection("question_tests").stream()
    data = []
    for doc in docs:
        doc_data = doc.to_dict()
        doc_data["id"] = doc.id
        data.append(doc_data)
    return data

def get_question_test_by_id(id):
    """Return one document as a dict (None if it does not exist)."""
    doc = firebase_db.collection("question_tests").document(id).get()
    return doc.to_dict()

def get_question_test_url_by_description(description):
    """Return the stored file URL for an exact description match, else False."""
    docs = firebase_db.collection("question_tests").where("question_tests_description", "==", description).stream()
    for doc in docs:
        return doc.to_dict()["question_tests_url"]
    return False


def create_question_test(data):
    """Upload data["question_tests_url"] (a file object) to Storage, then
    store the document with that field replaced by the resulting gs:// URL."""
    file_question_tests = data["question_tests_url"]
    file_url = upload_file_question_tests(file_question_tests)
    data["question_tests_url"] = file_url
    firebase_db.collection("question_tests").add(data)
    return True

def update_question_test(id, data):
    """Partially update a document by id."""
    firebase_db.collection("question_tests").document(id).update(data)
    return True

def delete_question_test(id):
    """Delete a document and its backing Storage file.

    BUG FIX: the document stores the Storage URL under "question_tests_url"
    (see create_question_test), not "question_tests" — the old key raised a
    KeyError on every delete.
    """
    file_url = get_question_test_by_id(id)["question_tests_url"]
    remove_file_question_tests(file_url)
    firebase_db.collection("question_tests").document(id).delete()
    return True
app/modules/{question_retrieval → question_tests_retrieval}/__init__.py RENAMED
@@ -1,8 +1,9 @@
1
  from fastapi import APIRouter, UploadFile, File
2
  from typing import Annotated
3
 
4
- from app.modules.question_retrieval.models.jd2text import jobdes2text
5
- from app.modules.question_retrieval.models.text2tector import text2vector
 
6
 
7
  qtretrieval_router = APIRouter(prefix="/qtretrieval", tags=["qtretrieval"])
8
 
@@ -14,12 +15,16 @@ async def index():
14
  # only upload .txt file
15
  async def send_jd(txt_file: Annotated[UploadFile, File(..., description="The JD file", media_type=["text/plain"])]):
16
  try:
17
- # read the txt file with format
18
- jobdes = txt_file.file.read().decode("utf-8")
19
- sumaryjd_text = jobdes2text(jobdes)
20
- print("sumaryjd_text: ", sumaryjd_text)
21
- sumaryjd_vector = text2vector(sumaryjd_text)
22
- print("sumaryjd_vector: ", sumaryjd_vector)
23
- return {"message": "Send JD successfully"}
 
 
 
 
24
  except Exception as e:
25
  return {"message": "Error", "error": str(e)}
 
from fastapi import APIRouter, UploadFile, File
from typing import Annotated

from app.modules.question_tests_retrieval.models.jd2text import jobdes2text
from app.modules.question_tests_retrieval.models.text2tector import text2vector
from app.modules.question_tests_retrieval.models.question_tests_logic import get_question_test

# Router for the question-test retrieval endpoints (mounted under /modules).
qtretrieval_router = APIRouter(prefix="/qtretrieval", tags=["qtretrieval"])
9
 
 
# only upload .txt file
async def send_jd(txt_file: Annotated[UploadFile, File(..., description="The JD file", media_type=["text/plain"])]):
    """Accept a JD .txt upload and fetch the best-matching question test.

    NOTE(review): the uploaded file is currently ignored — the JD summary is
    hard-coded below while the jd2text/text2vector pipeline is disabled.
    """
    try:
        # Disabled pipeline, kept for reference:
        # jobdes = txt_file.file.read().decode("utf-8")
        # sumaryjd_text = jobdes2text(jobdes)
        # print("sumaryjd_text: ", sumaryjd_text)
        # sumaryjd_vector = text2vector(sumaryjd_text)
        # print("sumaryjd_vector: ", sumaryjd_vector)
        text = "Job Title is Senior AI Engineer, Level is Senior, and Brief summary of required skills is NLP, experiencing in using containers"
        if not get_question_test(text):
            return {"message": "Error"}
        return {"message": "Send JD successfully and get question test successfully"}
    except Exception as e:
        return {"message": "Error", "error": str(e)}
app/modules/{question_retrieval → question_tests_retrieval}/models/jd2text.py RENAMED
File without changes
app/modules/question_tests_retrieval/models/question_tests_logic.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from dotenv import load_dotenv

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.evaluation import load_evaluator
from langchain.evaluation import EmbeddingDistance

from app.modules.crud_question_test.models.crud_question_tests import get_all_question_tests, get_question_test_url_by_description
from app.configs.database import firebase_bucket

# Import API key
load_dotenv()

# Define the google api key
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Embedding model + cosine-distance evaluator used to match a JD summary
# against the stored question-test descriptions.
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
gemini_evaluator = load_evaluator("embedding_distance", distance_metric=EmbeddingDistance.COSINE, embeddings=embedding_model)

# Cosine distance above this value is treated as "no acceptable match".
MATCH_THRESHOLD = 0.3


def compare_vector(vector_extract, vector_des):
    """Return the description in vector_des closest to vector_extract.

    Uses embedding cosine distance; returns False when no candidate scores
    below MATCH_THRESHOLD. (Fixes the misspelled "maxnimun_value" sentinel
    and the latent UnboundLocalError on des_item_choose in the old code.)
    """
    best_score = 2  # cosine distance is bounded by 2, so this acts as +inf
    best_item = None
    for item in vector_des:
        result = gemini_evaluator.evaluate_strings(prediction=vector_extract, reference=item)
        score = result.get('score')
        if score < best_score:
            best_score = score
            best_item = item
    if best_item is not None and best_score < MATCH_THRESHOLD:
        return best_item
    return False


def download_question_test(question_test_url):
    """Download a "gs://" Storage object into data/question_tests/."""
    # check folder exist
    if not os.path.exists('data/question_tests'):
        os.makedirs('data/question_tests')
    # download file from firebase storage using "gs://" link
    name_bucket = question_test_url.split(f"gs://{firebase_bucket.name}/")[1]
    blob = firebase_bucket.blob(name_bucket)
    blob.download_to_filename(f'data/question_tests/{name_bucket}')
    return True


def get_question_test(text):
    """Match text against stored descriptions and download the chosen test.

    Returns True on success, False otherwise.
    BUG FIX: the old code returned the *truthy* string
    "No question test found" on failure, so the caller's
    `if get_question_test(text)` reported success even when nothing matched.
    """
    all_question_tests = get_all_question_tests()
    descriptions = [item['question_tests_description'] for item in all_question_tests]
    chosen = compare_vector(text, descriptions)
    if chosen is False:
        print("No question test found")
        return False
    question_test_url = get_question_test_url_by_description(chosen)
    return download_question_test(question_test_url)
app/modules/{question_retrieval → question_tests_retrieval}/models/text2tector.py RENAMED
@@ -1,7 +1,10 @@
1
  import os
2
 
3
  from langchain_google_genai import GoogleGenerativeAIEmbeddings
 
4
 
 
 
5
  # Define the google api key
6
  os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
7
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
 
import os

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

# Pull GOOGLE_API_KEY (and friends) in from the project .env file.
load_dotenv()

# Expose the key both via the process environment and as a module constant.
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
credentials/fsa-firebase-database.json DELETED — NOTE(review): deleting this file does not scrub it from git history; the service-account private key shown below remains publicly exposed and must be revoked/rotated in the Google Cloud console immediately.
@@ -1,13 +0,0 @@
1
- {
2
- "type": "service_account",
3
- "project_id": "fsa-firebase-tutorial",
4
- "private_key_id": "89175f3dd111886b549bda9d5a8163ed1fe83900",
5
- "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDDUXR7JQkfiZRe\nkAFNutX4eIeJxrb00Hpkkbe9Doj+M+aYRrPyXsjEn9o0ZGeomSUwVqDHQ2Tby0lS\nlarvYnML7RaE5bIcIMuZ8CDyKsu9ZyxMIlOm4deJnOP9IfN4ascHxiiMHJBsHsLG\nVHQqmx3XJrfRVGELsxLF/0+fwQOdSiPPhU4MKxqmOjHmCLC9IJLnhrAKyQ1LgV+l\n+6fXc7EGtRVdjUCwQGZMv11UaJbjQqZSCFRpabsyw0/RenPmdyYa35R8F48YaiDR\n/WdxqqMTNbZrbhigiz3rIZmropyUdoPT/fKKZstP2vpFj8PbmtgThqlFEAo5l/d8\n5dgdHXCJAgMBAAECggEADeLexooKqOG3yRTRKOjzgTW1gi288HEZgIitIN11jyga\nuQEO1liXisYoJfEs/vs7XgFMrzoLA8bjTMIsC9aqogKORy7K783Dl+tO9lSlVU8O\nYs/OdXcJQ0UCnwqwxAMIaTS1CYgW2v8BX0ceFrDeSV988l6ewy2NNEM7cprPAlnE\n65uO5Es3/R3pLyKORwxkvfY3AVWksXR8FtUdxyMSLjdLqNZ0jxdltMttzyHSvTmS\nLBvXoEznI8bEvvrQwZryOYPvM0sHQSlzyFpJbx/WhBfZ/isZ5AwXZEO82g8zQE2c\nTnK50Wu8AjtynlZhRPEEvu6//2yE9ufCK64/kxcT0QKBgQD6eUn80r4UEWWJUCNU\noLjsuvJAS3usQwwLCrq/FZSx4b53RdA0BADy4n1u0+6ps1EmnN3bdSiuFCZ9DltL\neP4uhwUtz4pJEnFeVUx1SuzzqPn64LKukrGkYqSma8XXx2ZmUp54RsRo8cTU8oAw\nkuipIOj/ynDXbiGN/X0w827seQKBgQDHoKOx/K/sg1nBUHESJudAgpTrSKwUIMgL\nc/qHLanuh2pnK320tqJtkt3hsmC6fiwIU8kX/hSXiB81aZT02rAlZS8eQOftReg5\nlHF/m2B5PPQvamSZ4XUAon1XliXB1z3q/uw72Qc+KzYxUFdb/mBugXYEgo2md7eZ\nCJlaUDWAkQKBgQDERaT3q8BtA1DYb+avmx4tyzSnsz6fNw7J3EeIEwYo+eeBmUBU\nckq4CUkXPpWHf094/XVQUhdlib67QjPCCXbMyNsDEd2f8wbJT/gQ9XyGVwREVi1O\nZP80/GNfUZZ/xu30AovRXkrsnX+LCtfuuD/+wB40ytLvzYoXtOqUID3sqQKBgAym\ncFkiEx/gvvpSF6g30OW0oK08QLNR1HttUQ3p5d/YsvRwvBmMjUR2TxszS4/l2PJ6\nJL8hebqUhK3E0AyoIwtrmw3GvSu7B1lGv95/QSDNrQeoLVHXCKwOn/TB7giXlDPl\nDaFJQn09DcUYO0D4ILI///OPNly7L2ZCNj2T4aARAoGANEfSQY9v2X21VcuKf5Gl\nIZ0682yvtUmkX0EF8mNSz11GIscWGjQWtEi1v+BSYB0By4mHkg67lOjKcYfaFwqT\nYCPsJgl9pfreraT7hDt8CjJo6QfIcgNhbseCyTF3qqUxQ47aN9c9n0YCd/p8lCvB\ndcbch0+stt5cRoZwXcxGSEo=\n-----END PRIVATE KEY-----\n",
6
- "client_email": "firebase-adminsdk-yn0hp@fsa-firebase-tutorial.iam.gserviceaccount.com",
7
- "client_id": "104598090077479952875",
8
- "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
- "token_uri": "https://oauth2.googleapis.com/token",
10
- "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
- "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/firebase-adminsdk-yn0hp%40fsa-firebase-tutorial.iam.gserviceaccount.com",
12
- "universe_domain": "googleapis.com"
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/question_tests/Question_AI_Senior_1.json ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__count__": 30,
3
+ "data": [
4
+ {
5
+ "id": "1",
6
+ "question": "What is the concept of 'Curse of Dimensionality' in Machine Learning?",
7
+ "choices": [
8
+ "A. It refers to the difficulty in visualizing high-dimensional data.",
9
+ "B. It refers to the phenomenon where the feature space becomes increasingly sparse for an increasing number of dimensions of a fixed-size training dataset.",
10
+ "C. It refers to the computational complexity of mathematical operations in high-dimensional spaces.",
11
+ "D. It refers to the overfitting that occurs when the model is too complex."
12
+ ],
13
+ "explanation": "The 'Curse of Dimensionality' refers to various phenomena that arise when analyzing and organizing data in high-dimensional spaces (often with hundreds or thousands of dimensions) that do not occur in low-dimensional settings such as the three-dimensional physical space of everyday experience.",
14
+ "answer": "B",
15
+ "level": "Senior",
16
+ "domain": "AI"
17
+ },
18
+ {
19
+ "id": "2",
20
+ "question": "What is the difference between 'one-hot encoding' and 'label encoding'?",
21
+ "choices": [
22
+ "A. 'One-hot encoding' is used for ordinal variables, while 'label encoding' is used for nominal variables.",
23
+ "B. 'Label encoding' is used for ordinal variables, while 'one-hot encoding' is used for nominal variables.",
24
+ "C. Both 'one-hot encoding' and 'label encoding' are used for ordinal variables.",
25
+ "D. Both 'one-hot encoding' and 'label encoding' are used for nominal variables."
26
+ ],
27
+ "explanation": "'Label encoding' and 'one-hot encoding' are two common methods used to convert categorical data into numerical data. 'Label encoding' assigns each unique category in a categorical variable with an integer. No new columns are created. On the other hand, 'one-hot encoding' creates new (binary) columns, indicating the presence of each possible value from the original data.",
28
+ "answer": "B",
29
+ "level": "Senior",
30
+ "domain": "AI"
31
+ },
32
+ {
33
+ "id": "3",
34
+ "question": "What is 'Early Stopping' in model training?",
35
+ "choices": [
36
+ "A. It's a form of regularization used to avoid overfitting when training a learner with an iterative method.",
37
+ "B. It's a technique used to speed up training by stopping the training process before it completes all iterations.",
38
+ "C. It's a method used to stop training when the model's performance starts to decrease.",
39
+ "D. It's a technique used to stop training when the model's performance does not improve on an held-out validation set."
40
+ ],
41
+ "explanation": "'Early stopping' is a form of regularization used to avoid overfitting when training a machine learning model with an iterative method, such as gradient descent. Such methods update the model's parameters (such as the weights in a neural network) iteratively, and early stopping halts this iterative process when the model's performance stops improving on a held-out validation dataset.",
42
+ "answer": "A",
43
+ "level": "Senior",
44
+ "domain": "AI"
45
+ },
46
+ {
47
+ "id": "4",
48
+ "question": "What is the difference between 'Random Forest' and 'Gradient Boosting' algorithms?",
49
+ "choices": [
50
+ "A. 'Random Forest' is a bagging algorithm, while 'Gradient Boosting' is a boosting algorithm.",
51
+ "B. 'Gradient Boosting' is a bagging algorithm, while 'Random Forest' is a boosting algorithm.",
52
+ "C. Both 'Random Forest' and 'Gradient Boosting' are bagging algorithms.",
53
+ "D. Both 'Random Forest' and 'Gradient Boosting' are boosting algorithms."
54
+ ],
55
+ "explanation": "'Random Forest' is a bagging algorithm and 'Gradient Boosting' is a boosting algorithm. Both are ensemble methods, but they combine the models in different ways. In 'Random Forest', each tree in the ensemble is built from a sample drawn with replacement (i.e., a bootstrap sample) from the training set. In 'Gradient Boosting', each new tree is fit on a modified version of the original data set.",
56
+ "answer": "A",
57
+ "level": "Senior",
58
+ "domain": "AI"
59
+ },
60
+ {
61
+ "id": "5",
62
+ "question": "What is 'Transfer Learning' in the context of Machine Learning?",
63
+ "choices": [
64
+ "A. It's a technique where a pre-trained model is used on a new problem.",
65
+ "B. It's a technique to transfer knowledge from one model to another.",
66
+ "C. It's a technique to transfer knowledge from one problem domain to another.",
67
+ "D. All of the above."
68
+ ],
69
+ "explanation": "'Transfer Learning' is a research problem in machine learning that focuses on storing knowledge gained while solving one problem and applying it to a different but related problem. For example, knowledge gained while learning to recognize cars could apply when trying to recognize trucks.",
70
+ "answer": "D",
71
+ "level": "Senior",
72
+ "domain": "AI"
73
+ },
74
+ {
75
+ "id": "6",
76
+ "question": "What is the purpose of 'ReLU' in a neural network?",
77
+ "choices": [
78
+ "A. To introduce non-linearity in the neural network.",
79
+ "B. To normalize the output of the neural network.",
80
+ "C. To speed up the training process of the neural network.",
81
+ "D. To prevent overfitting in the neural network."
82
+ ],
83
+ "explanation": "'ReLU' stands for 'Rectified Linear Unit'. It is the most commonly used activation function in neural networks and deep learning models. The function returns 0 if it receives any negative input, but for any positive value 'x' it returns that value back. It's used to introduce non-linearity in the neural network.",
84
+ "answer": "A",
85
+ "level": "Senior",
86
+ "domain": "AI"
87
+ },
88
+ {
89
+ "id": "7",
90
+ "question": "What is 'Batch Normalization' in the context of Neural Networks?",
91
+ "choices": [
92
+ "A. It's a technique to provide any layer in a neural network with inputs that are zero mean/unit variance.",
93
+ "B. It's a technique to normalize the output of a previous activation layer by subtracting the batch mean and dividing by the batch standard deviation.",
94
+ "C. It's a technique to make the weights of a neural network have zero mean and unit variance.",
95
+ "D. Both A and B."
96
+ ],
97
+ "explanation": "'Batch Normalization' is a technique to provide any layer in a neural network with inputs that are zero mean/unit variance, and it is a technique to normalize the output of the previous activation layer by subtracting the batch mean and dividing by the batch standard deviation. This technique helps to speed up learning in deep neural networks by reducing internal covariate shift, and it has become a standard component of most state-of-the-art neural networks.",
98
+ "answer": "D",
99
+ "level": "Senior",
100
+ "domain": "AI"
101
+ },
102
+ {
103
+ "id": "8",
104
+ "question": "What is the purpose of 'L1' and 'L2' regularization?",
105
+ "choices": [
106
+ "A. They are techniques to prevent overfitting in a machine learning model.",
107
+ "B. They are techniques to increase the speed of training a machine learning model.",
108
+ "C. They are techniques to increase the complexity of a machine learning model.",
109
+ "D. They are techniques to decrease the complexity of a machine learning model."
110
+ ],
111
+ "explanation": "'L1' and 'L2' are regularization techniques used to prevent overfitting in a machine learning model by adding a penalty term to the loss function. The penalty term encourages the model to have smaller weights, which makes the model simpler and thus less likely to overfit.",
112
+ "answer": "A",
113
+ "level": "Senior",
114
+ "domain": "AI"
115
+ },
116
+ {
117
+ "id": "9",
118
+ "question": "What is 'Ensemble Learning' in the context of Machine Learning?",
119
+ "choices": [
120
+ "A. It's a technique where multiple models are trained to solve the same problem and combined to get better results.",
121
+ "B. It's a technique where one model is trained to solve multiple problems.",
122
+ "C. It's a technique where the model is trained on an ensemble of different datasets.",
123
+ "D. It's a technique where the model is trained multiple times on the same dataset."
124
+ ],
125
+ "explanation": "'Ensemble Learning' is a machine learning paradigm where multiple models (often called 'weak learners') are trained to solve the same problem and combined to get better results. The main hypothesis is that when weak models are correctly combined we can obtain more accurate and/or robust models.",
126
+ "answer": "A",
127
+ "level": "Senior",
128
+ "domain": "AI"
129
+ },
130
+ {
131
+ "id": "10",
132
+ "question": "What is the difference between 'Ridge' and 'Lasso' regression?",
133
+ "choices": [
134
+ "A. 'Ridge' regression uses L1 regularization while 'Lasso' regression uses L2 regularization.",
135
+ "B. 'Lasso' regression uses L1 regularization while 'Ridge' regression uses L2 regularization.",
136
+ "C. Both 'Ridge' and 'Lasso' regression use L1 regularization.",
137
+ "D. Both 'Ridge' and 'Lasso' regression use L2 regularization."
138
+ ],
139
+ "explanation": "'Ridge' and 'Lasso' regression are two types of linear regression models that use different types of regularization. 'Ridge' regression uses L2 regularization, which adds a penalty equal to the square of the magnitude of coefficients. On the other hand, 'Lasso' regression uses L1 regularization, which adds a penalty equal to the absolute value of the magnitude of coefficients.",
140
+ "answer": "B",
141
+ "level": "Senior",
142
+ "domain": "AI"
143
+ },
144
+ {
145
+ "id": "11",
146
+ "question": "What is 'Data Augmentation' in the context of Machine Learning?",
147
+ "choices": [
148
+ "A. It's a technique to artificially create new training data from existing training data.",
149
+ "B. It's a technique to increase the size of the dataset by collecting more data.",
150
+ "C. It's a technique to clean the training data.",
151
+ "D. It's a technique to reduce the size of the dataset."
152
+ ],
153
+ "explanation": "'Data Augmentation' is a strategy that enables practitioners to significantly increase the diversity of data available for training models, without actually collecting new data. Data augmentation techniques such as cropping, padding, and horizontal flipping are commonly used to train large neural networks.",
154
+ "answer": "A",
155
+ "level": "Senior",
156
+ "domain": "AI"
157
+ },
158
+ {
159
+ "id": "12",
160
+ "question": "What is the purpose of 'Max Pooling' in a Convolutional Neural Network (CNN)?",
161
+ "choices": [
162
+ "A. To reduce the spatial dimensions of the output volume.",
163
+ "B. To increase the spatial dimensions of the output volume.",
164
+ "C. To normalize the output of the previous activation layer.",
165
+ "D. To introduce non-linearity in the neural network."
166
+ ],
167
+ "explanation": "'Max Pooling' is a pooling operation that is typically added to CNNs following individual convolutional layers. When added to a model, max pooling reduces the dimensionality of images by reducing the number of pixels in the output from the previous convolutional layer.",
168
+ "answer": "A",
169
+ "level": "Senior",
170
+ "domain": "AI"
171
+ },
172
+ {
173
+ "id": "13",
174
+ "question": "What is the difference between 'Batch Gradient Descent' and 'Mini-Batch Gradient Descent'?",
175
+ "choices": [
176
+ "A. 'Batch Gradient Descent' uses the entire training set to compute the gradient of the cost function, while 'Mini-Batch Gradient Descent' uses a subset of the training set.",
177
+ "B. 'Mini-Batch Gradient Descent' uses the entire training set to compute the gradient of the cost function, while 'Batch Gradient Descent' uses a subset of the training set.",
178
+ "C. Both 'Batch Gradient Descent' and 'Mini-Batch Gradient Descent' use the entire training set to compute the gradient of the cost function.",
179
+ "D. Both 'Batch Gradient Descent' and 'Mini-Batch Gradient Descent' use a subset of the training set to compute the gradient of the cost function."
180
+ ],
181
+ "explanation": "'Batch Gradient Descent' uses the entire training set to compute the gradient of the cost function, while 'Mini-Batch Gradient Descent' uses a subset of the training set. With 'Mini-Batch Gradient Descent', you can replace the actual gradient (calculated from the entire data set) with an estimate of the gradient (calculated from a randomly selected subset of the data). Especially in big data applications, this can help to speed up gradient-based optimization algorithms significantly.",
182
+ "answer": "A",
183
+ "level": "Senior",
184
+ "domain": "AI"
185
+ },
186
+ {
187
+ "id": "14",
188
+ "question": "What is 'Principal Component Analysis' (PCA) used for?",
189
+ "choices": [
190
+ "A. PCA is used to compress the data by reducing the number of dimensions.",
191
+ "B. PCA is used to decompress the data by increasing the number of dimensions.",
192
+ "C. PCA is used to classify the data into different categories.",
193
+ "D. PCA is used to cluster the data into different groups."
194
+ ],
195
+ "explanation": "'Principal Component Analysis' (PCA) is a dimensionality reduction technique that is commonly used in machine learning and data visualization. It can be thought of as a projection method where data with 'm' columns (features) is projected into a subspace with 'm' or fewer columns, whilst retaining the essence of the original data.",
196
+ "answer": "A",
197
+ "level": "Senior",
198
+ "domain": "AI"
199
+ },
200
+ {
201
+ "id": "15",
202
+ "question": "What is the purpose of 'Word Embeddings' in Natural Language Processing (NLP)?",
203
+ "choices": [
204
+ "A. To map words or phrases from the vocabulary to vectors of real numbers.",
205
+ "B. To map words or phrases from the vocabulary to a dictionary of words.",
206
+ "C. To convert the words in the vocabulary to lower case.",
207
+ "D. To remove stop words from the vocabulary."
208
+ ],
209
+ "explanation": "'Word Embeddings' are a type of word representation that allows words with similar meaning to have a similar representation. They are a distributed representation for text that is perhaps one of the key breakthroughs for the impressive performance of deep learning methods on challenging natural language processing problems.",
210
+ "answer": "A",
211
+ "level": "Senior",
212
+ "domain": "AI"
213
+ },
214
+ {
215
+ "id": "16",
216
+ "question": "What is the difference between 'Long Short Term Memory' (LSTM) and 'Gated Recurrent Unit' (GRU)?",
217
+ "choices": [
218
+ "A. LSTM has three gates (input, output, forget) while GRU has two gates (reset, update).",
219
+ "B. GRU has three gates (input, output, forget) while LSTM has two gates (reset, update).",
220
+ "C. Both LSTM and GRU have three gates (input, output, forget).",
221
+ "D. Both LSTM and GRU have two gates (reset, update)."
222
+ ],
223
+ "explanation": "Both LSTM (Long Short Term Memory) and GRU (Gated Recurrent Unit) are types of recurrent neural network (RNN) architecture used in deep learning. The key difference between them is that LSTM has three gates (input, output, forget), while GRU has two gates (reset, update). This makes the GRU a simpler and more efficient model for certain tasks.",
224
+ "answer": "A",
225
+ "level": "Senior",
226
+ "domain": "AI"
227
+ },
228
+ {
229
+ "id": "17",
230
+ "question": "What is 'Autoencoder' in the context of Machine Learning?",
231
+ "choices": [
232
+ "A. It's a type of artificial neural network used for learning efficient codings of input data.",
233
+ "B. It's a type of artificial neural network used for generating new data that is similar to the input data.",
234
+ "C. It's a type of artificial neural network used for classifying input data into different categories.",
235
+ "D. It's a type of artificial neural network used for clustering input data into different groups."
236
+ ],
237
+ "explanation": "An 'Autoencoder' is a type of artificial neural network used for learning efficient codings of input data. It's typically used for the purpose of dimensionality reduction and feature learning.",
238
+ "answer": "A",
239
+ "level": "Senior",
240
+ "domain": "AI"
241
+ },
242
+ {
243
+ "id": "18",
244
+ "question": "What is the purpose of 'Attention Mechanism' in the context of Machine Learning?",
245
+ "choices": [
246
+ "A. It's used to focus on certain parts of the input data that are more relevant to the task at hand.",
247
+ "B. It's used to pay equal attention to all parts of the input data.",
248
+ "C. It's used to ignore certain parts of the input data that are not relevant to the task at hand.",
249
+ "D. Both A and C."
250
+ ],
251
+ "explanation": "The 'Attention Mechanism' is a technique used in machine learning models, especially in deep learning models, to focus on certain parts of the input data that are more relevant to the task at hand, and to ignore other parts. It's particularly useful in tasks such as machine translation, where it's important to focus on the right words in the input sequence when generating the output sequence.",
252
+ "answer": "D",
253
+ "level": "Senior",
254
+ "domain": "AI"
255
+ },
256
+ {
257
+ "id": "19",
258
+ "question": "What is 'Reinforcement Learning' in the context of Machine Learning?",
259
+ "choices": [
260
+ "A. It's a type of machine learning where an agent learns to make decisions by taking actions in an environment to maximize some notion of cumulative reward.",
261
+ "B. It's a type of machine learning where an agent learns to make decisions based on a fixed set of rules.",
262
+ "C. It's a type of machine learning where an agent learns to make decisions based on a predefined set of actions.",
263
+ "D. It's a type of machine learning where an agent learns to make decisions based on the actions taken by other agents."
264
+ ],
265
+ "explanation": "'Reinforcement Learning' is a type of machine learning where an agent learns to make decisions by taking actions in an environment to maximize some notion of cumulative reward. The agent learns from the consequences of its actions, rather than from being explicitly taught, and it selects its actions on the basis of its past experiences (exploitation) and also by new choices (exploration).",
266
+ "answer": "A",
267
+ "level": "Senior",
268
+ "domain": "AI"
269
+ },
270
+ {
271
+ "id": "20",
272
+ "question": "What is 'Generative Adversarial Network' (GAN) in the context of Machine Learning?",
273
+ "choices": [
274
+ "A. It's a class of machine learning systems invented by Ian Goodfellow and his colleagues in 2014.",
275
+ "B. It's a class of machine learning systems where two neural networks contest with each other in a game.",
276
+ "C. It's a class of machine learning systems where one neural network, called the generator, generates new data instances, while the other, the discriminator, evaluates them for authenticity.",
277
+ "D. All of the above."
278
+ ],
279
+ "explanation": "'Generative Adversarial Network' (GAN) is a class of machine learning systems invented by Ian Goodfellow and his colleagues in 2014. Two neural networks contest with each other in a game. Given a training set, this technique learns to generate new data with the same statistics as the training set. For example, a GAN trained on photographs can generate new photographs that look at least superficially authentic to human observers, having many realistic characteristics.",
280
+ "answer": "D",
281
+ "level": "Senior",
282
+ "domain": "AI"
283
+ },
284
+ {
285
+ "id": "21",
286
+ "question": "Write a Python function to implement a basic 'K-Nearest Neighbors' (KNN) model.",
287
+ "explanation": "The function should take a dataset and a value for 'K' as arguments and return a trained KNN model.",
288
+ "level": "Senior",
289
+ "domain": "AI"
290
+ },
291
+ {
292
+ "id": "22",
293
+ "question": "Write a Python function to implement a basic 'Naive Bayes' model.",
294
+ "explanation": "The function should take a dataset as an argument and return a trained Naive Bayes model.",
295
+ "level": "Senior",
296
+ "domain": "AI"
297
+ },
298
+ {
299
+ "id": "23",
300
+ "question": "Write a Python function to implement a basic 'Random Forest' model.",
301
+ "explanation": "The function should take a dataset as an argument and return a trained Random Forest model.",
302
+ "level": "Senior",
303
+ "domain": "AI"
304
+ },
305
+ {
306
+ "id": "24",
307
+ "question": "Write a Python function to implement a basic 'Gradient Boosting' model.",
308
+ "explanation": "The function should take a dataset as an argument and return a trained Gradient Boosting model.",
309
+ "level": "Senior",
310
+ "domain": "AI"
311
+ },
312
+ {
313
+ "id": "25",
314
+ "question": "Write a Python function to implement a basic 'Deep Neural Network' (DNN) model.",
315
+ "explanation": "The function should take a dataset as an argument and return a trained DNN model.",
316
+ "level": "Senior",
317
+ "domain": "AI"
318
+ },
319
+ {
320
+ "id": "26",
321
+ "question": "Write a Python function to implement a basic 'Convolutional Neural Network' (CNN) model.",
322
+ "explanation": "The function should take a dataset as an argument and return a trained CNN model.",
323
+ "level": "Senior",
324
+ "domain": "AI"
325
+ },
326
+ {
327
+ "id": "27",
328
+ "question": "Write a Python function to implement a basic 'Decision Tree' model.",
329
+ "explanation": "The function should take a dataset as an argument and return a trained decision tree model.",
330
+ "level": "Senior",
331
+ "domain": "AI"
332
+ },
333
+ {
334
+ "id": "28",
335
+ "question": "Write a Python function to implement a basic 'Support Vector Machine' (SVM) model.",
336
+ "explanation": "The function should take a dataset as an argument and return a trained SVM model.",
337
+ "level": "Senior",
338
+ "domain": "AI"
339
+ },
340
+ {
341
+ "id": "29",
342
+ "question": "Write a Python function to implement a basic 'Linear Regression' model.",
343
+ "explanation": "The function should take a dataset as an argument and return a trained linear regression model.",
344
+ "level": "Senior",
345
+ "domain": "AI"
346
+ },
347
+ {
348
+ "id": "30",
349
+ "question": "Write a Python function to implement a basic 'Logistic Regression' model.",
350
+ "explanation": "The function should take a dataset as an argument and return a trained logistic regression model.",
351
+ "level": "Senior",
352
+ "domain": "AI"
353
+ }
354
+ ]
355
+ }
data/test_data/jd_1.txt DELETED
@@ -1,11 +0,0 @@
1
- Senior AI Engineer
2
-
3
- This position focuses on developing algorithms for multilingual conversational systems and improving chatbots using advanced Natural Language Processing (NLP) techniques. Responsibilities include collaborating on NLP applications, conducting research, and optimizing AI models.
4
-
5
- Responsibilities:
6
- • Develop algorithms in multilingual conversational systems
7
- • Implement NLP techniques to enhance text analysis & chatbots understanding of user intent, sentiment analysis, and context-aware responses
8
- • Team up with software engineers to build end-to-end NLP applications
9
- • Conduct research and stay up-to-date with the latest advancements in NLP
10
- • Monitor and analyze the performance of AI models in production, identify and troubleshoot any issues or bottlenecks, and propose solutions for optimization
11
- • Communicating complex analytical findings and recommendations to non-technical stakeholders, including senior management.