phase3/quangdt: Merge from main -> phase3/quangdt
Changed files:
- .github/README.md +8 -0
- .github/workflows/huggingface.yaml +21 -0
- .gitignore +1 -1
- dockerfile → Dockerfile +4 -2
- README.md +8 -10
- app/modules/crud_cvs/models/crud_cvs.py +19 -0
- app/modules/crud_jds/models/crud_jds.py +6 -0
- app/modules/matching_cv/__init__.py +41 -19
- app/modules/matching_cv/models/matching_cv_logic.py +1 -4
- app/modules/question_tests_retrieval/__init__.py +8 -7
- app/modules/question_tests_retrieval/models/jd2text.py +2 -2
- app/modules/question_tests_retrieval/models/question_tests_logic.py +1 -1
- app/modules/question_tests_retrieval/models/text2vector.py +1 -1
- scrapping.py +8 -0
- tools/crawl_data.py +181 -0
.github/README.md
ADDED
@@ -0,0 +1,8 @@
+ # Simple Genarates Question Test
+
+
+ ## Tech Stack
+
+ **Client:** Streamlit
+
+ **Server:** FastAPI
.github/workflows/huggingface.yaml
ADDED
@@ -0,0 +1,21 @@
+ name: 🤗 Sync to HuggingFace Space
+
+ on:
+   push:
+     branches: [main]
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       # Checkout repo
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+
+       # Push to hub
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push -f https://MillMin:$HF_TOKEN@huggingface.co/spaces/MillMin/FSA-PROJECT-CV main
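The workflow force-pushes the repository to the MillMin/FSA-PROJECT-CV Space on every push to main, authenticating with the HF_TOKEN repository secret. A minimal sketch for checking locally that a token has the expected identity before saving it as that secret (assuming the huggingface_hub package is installed and the token is exported as HF_TOKEN; this is not part of the commit):

import os
from huggingface_hub import HfApi

# Assumption: HF_TOKEN is exported in the local environment for this check;
# in CI the workflow reads it from the GitHub secret instead.
api = HfApi(token=os.environ["HF_TOKEN"])
print(api.whoami()["name"])  # should print the account that owns the target Space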
.gitignore
CHANGED
@@ -5,4 +5,4 @@ credentials/
  data/CV
  data/JD
  data/QUESTION
- !.gitkeep
+ !.gitkeep
dockerfile → Dockerfile
RENAMED
@@ -1,4 +1,4 @@
- FROM python:3.
+ FROM python:3.11.7-alpine3.19

  WORKDIR /code

@@ -8,4 +8,6 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

  COPY . .

-
+ EXPOSE 8000
+
+ CMD python main.py
README.md
CHANGED
@@ -1,10 +1,8 @@
-
-
-
-
-
-
-
-
- **Server:** FastAPI
-
+ ---
+ title: FSA PROJECT CV
+ emoji: 🏢
+ colorFrom: green
+ colorTo: red
+ sdk: docker
+ pinned: false
+ ---
app/modules/crud_cvs/models/crud_cvs.py
CHANGED
@@ -1,5 +1,7 @@
  import uuid
  from app.configs.database import firebase_bucket, firebase_db
+ import io
+ from docx import Document

  # CRUD operation
  def upload_file_cvs(file):
@@ -16,6 +18,23 @@ def remove_file_cvs(file_url):
      blob.delete()
      return True

+ def file_cv_doc2text(file_url):
+     # download file from firebase storage using "gs://" link
+     blob = firebase_bucket.blob(file_url.split(f"gs://{firebase_bucket.name}/")[1])
+     # download file and return string in file
+     file_bytes = blob.download_as_bytes()
+     # Create a BytesIO object from the file bytes
+     file_stream = io.BytesIO(file_bytes)
+     # Read the .docx file from the BytesIO object
+     doc = Document(file_stream)
+     # Extract text from the .docx file
+     text = ""
+     for paragraph in doc.paragraphs:
+         text += paragraph.text + "\n"
+
+     return text
+
+
  def get_all_cvs():
      # Get all documents from the collection
      docs = firebase_db.collection("cvs").stream()
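For a quick offline check of the .docx extraction logic in file_cv_doc2text, a minimal sketch that exercises the same python-docx calls against a local file (the sample.docx path is a hypothetical test fixture; Firebase is deliberately left out):

import io
from docx import Document  # python-docx, the same dependency used above

def docx_bytes_to_text(file_bytes: bytes) -> str:
    # Mirrors file_cv_doc2text, but starts from raw bytes instead of a Firebase blob
    doc = Document(io.BytesIO(file_bytes))
    return "\n".join(p.text for p in doc.paragraphs)

if __name__ == "__main__":
    with open("sample.docx", "rb") as f:  # hypothetical local fixture
        print(docx_bytes_to_text(f.read()))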
app/modules/crud_jds/models/crud_jds.py
CHANGED
@@ -17,6 +17,12 @@ def remove_file_jds(file_url):
      blob.delete()
      return True

+ def file_jd_txt2text(file_url):
+     # download file from firebase storage using "gs://" link
+     blob = firebase_bucket.blob(file_url.split(f"gs://{firebase_bucket.name}/")[1])
+     # download file and return string in file
+     return blob.download_as_text()
+
  def get_all_jds():
      # Get all documents from the collection
      docs = firebase_db.collection("jds").stream()
app/modules/matching_cv/__init__.py
CHANGED
@@ -1,9 +1,11 @@
  import docx

  from fastapi import APIRouter
- from app.modules.matching_cv.models.match_cv_jd_model import Match_JD_CV_Model
+ # from app.modules.matching_cv.models.match_cv_jd_model import Match_JD_CV_Model

- from app.modules.matching_cv.models.matching_cv_logic import result_matching_cv_jd
+ from app.modules.matching_cv.models.matching_cv_logic import result_matching_cv_jd
+ from app.modules.crud_jds.models.crud_jds import get_jd_by_id, file_jd_txt2text
+ from app.modules.crud_cvs.models.crud_cvs import get_cv_by_id, file_cv_doc2text

  cvmatching_router = APIRouter(prefix="/cvmatching", tags=["cvmatching"])

@@ -13,24 +15,44 @@ async def index():

  @cvmatching_router.post("/matching")
  # only upload .pdf or .docx file
- async def matching_cv_jd(
-     jd_upload: Match_JD_CV_Model.jd = Match_JD_CV_Model.jd_default,
-     cv_upload: Match_JD_CV_Model.cv = Match_JD_CV_Model.cv_default):
+ async def matching_cv_jd(id_jd: str, id_cv:str):
      try:
-         #
-
-
-
-
-
-
-
-
-
-
-
+         # get jd and cv by id
+         jd_document = get_jd_by_id(id_jd)
+         cv_document = get_cv_by_id(id_cv)
+
+         # download file from firebase storage
+         jd_url = jd_document["jd_url"]
+         cv_url = cv_document["cv_url"]
+
+         # get type file cv from cv_url "gs://bucket_name/file_name"
+         cv_type = cv_url.split(".")[-1]
+         if cv_type == "pdf":
+             return {"message": "This feature is not available yet"}
+         elif cv_type == "docx":
+             cv_text = file_cv_doc2text(cv_url)
          else:
-             return {"message": "Please upload only .
+             return {"message": "Please upload only .pdf or .docx file for CV"}
+
+         # get jd_text from jd_url "gs://bucket_name/file_name"
+         jd_text = file_jd_txt2text(jd_url)
+
+         result = result_matching_cv_jd(cv_text, jd_text)
+         return {"result": result}
+         # # take jd_upload and cv_upload type file
+         # jd_upload_type = jd_upload.filename.split(".")[-1]
+         # cv_upload_type = cv_upload.filename.split(".")[-1]
+         # if jd_upload_type in ["txt"] and cv_upload_type in ["pdf", "docx"]:
+         #     jd_text = jd_upload.file.read().decode("utf-8")
+         #     if cv_upload_type == "docx":
+         #         cv_text = docx.Document(cv_upload.file).paragraphs
+         #         cv_text = "\n".join([i.text for i in cv_text])
+         #     elif cv_upload_type == "pdf":
+         #         return {"message": "This feature is not available yet"}
+         #     # check matching cv and jd
+         #     result = result_matching_cv_jd(cv_text, jd_text)
+         #     return {"result": result}
+         # else:
+         #     return {"message": "Please upload only .txt for JD. And .pdf or .docx file for CV"}
      except Exception as e:
          return {"Error": str(e)}
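The endpoint now takes Firestore document ids instead of file uploads. A minimal sketch of calling it with FastAPI's TestClient (assuming the application object is exposed as app in main.py; the ids are hypothetical placeholders):

from fastapi.testclient import TestClient

from main import app  # assumption: main.py exposes the FastAPI instance as `app`

client = TestClient(app)

# Hypothetical Firestore document ids for an existing JD and CV
response = client.post(
    "/cvmatching/matching",
    params={"id_jd": "<jd-document-id>", "id_cv": "<cv-document-id>"},
)
print(response.json())  # {"result": ...} on success, or an error message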
app/modules/matching_cv/models/matching_cv_logic.py
CHANGED
@@ -42,11 +42,8 @@ def result_matching_cv_jd(cv_text, jd_text):
      # create the chat message
      chat_message = chat_template.format_messages(cv=cv_text, jd=jd_text)

-     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY)
+     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY, request_timeout=120)
      chain = llm | parser
      result = chain.invoke(chat_message)

      return result
-
- def load_jd_from_id():
-     pass
app/modules/question_tests_retrieval/__init__.py
CHANGED
@@ -2,8 +2,8 @@ from fastapi import APIRouter, UploadFile, File
  from typing import Annotated

  from app.modules.question_tests_retrieval.models.jd2text import jobdes2text
- # from app.modules.question_tests_retrieval.models.text2vector import text2vector
  from app.modules.question_tests_retrieval.models.question_tests_logic import get_question_tests
+ from app.modules.crud_jds.models.crud_jds import get_jd_by_id, file_jd_txt2text

  qtretrieval_router = APIRouter(prefix="/qtretrieval", tags=["qtretrieval"])

@@ -11,18 +11,19 @@ qtretrieval_router = APIRouter(prefix="/qtretrieval", tags=["qtretrieval"])
  async def index():
      return {"message": "Welcome to question retrieval page"}

- @qtretrieval_router.post("/
+ @qtretrieval_router.post("/send_jd_to_get_question")
  # only upload .txt file
- async def
+ async def send_jd_to_get_question(id_jd: str):
      try:
-
-
-
+         jd_document = get_jd_by_id(id_jd)
+         # download jd file from firebase storage
+         jd_file_string = file_jd_txt2text(jd_document["jd_url"])
+         sumaryjd_text = jobdes2text(jd_file_string)
          if get_question_tests(sumaryjd_text):
              return {"message": "Send JD successfully and get question test successfully",
                      "sumaryjd_text": sumaryjd_text}
          else:
              return {"message": "Please upload only .txt file", "error": str(e)}
      except Exception as e:
-         return {"message": "
+         return {"message": "Have error when find JD in database", "error": str(e)}
app/modules/question_tests_retrieval/models/jd2text.py
CHANGED
@@ -17,7 +17,7 @@ parser = JsonOutputParser()

  def jobdes2text(jobdes: str) -> str:
      # setup the gemini pro
-     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY)
+     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY, request_timeout=120)

      # create the prompt template
      finnal_jd_chat_template = ChatPromptTemplate.from_messages(
@@ -27,7 +27,7 @@ def jobdes2text(jobdes: str) -> str:
              """Return Job title, level(Fresher, Junior, Senior, ...) and Brief summary of required skills about 20 words from the job description. Use the following format: Job Title is {job title}, Level is {level}, and Brief summary of required skills is {brief summary of required skills}."""
          )
      ),
-         HumanMessagePromptTemplate.from_template("{text}"),
+         HumanMessagePromptTemplate.from_template("{text}"),
      ]
  )
app/modules/question_tests_retrieval/models/question_tests_logic.py
CHANGED
@@ -18,7 +18,7 @@ os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

  # Setting model embedding
- embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
+ embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY, request_timeout=120)
  gemini_evaluator = load_evaluator("embedding_distance", distance_metric=EmbeddingDistance.COSINE, embeddings=embedding_model)

  # def compare_vector(vector_extract, vector_des):
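For context, the embedding-distance evaluator configured above is typically invoked along these lines. This is a sketch only: the strings are illustrative, gemini_evaluator is the object built in the module, and the call performs a real embedding request, so GOOGLE_API_KEY must be set:

# Sketch: gemini_evaluator comes from the module above; the texts are made up.
result = gemini_evaluator.evaluate_strings(
    prediction="Junior Python developer with FastAPI experience",
    reference="Fresher backend developer, Python and REST APIs",
)
print(result)  # e.g. {"score": 0.12}; a smaller cosine distance means more similar texts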
app/modules/question_tests_retrieval/models/text2vector.py
CHANGED
@@ -10,6 +10,6 @@ os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

  def text2vector(text):
-     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY, request_timeout=120)
      vector = embeddings.embed_query(text)
      return vector
scrapping.py
ADDED
@@ -0,0 +1,8 @@
+ from bs4 import BeautifulSoup
+ import requests
+
+ url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"
+ page = requests.get(url)
+ soup = BeautifulSoup(page.text, 'html')
+
+ print(soup)
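A slightly more defensive variant of the same snippet, naming bs4's built-in parser explicitly and failing fast on HTTP errors (a suggestion only, not part of the commit):

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"
page = requests.get(url, timeout=30)
page.raise_for_status()  # stop here instead of parsing an error page
soup = BeautifulSoup(page.text, "html.parser")  # explicit parser avoids bs4's parser guessing
print(soup.title.text)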
tools/crawl_data.py
ADDED
@@ -0,0 +1,181 @@
+ # import requests
+ # from bs4 import BeautifulSoup
+
+ # url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue' # Replace with the URL you intend to scrape
+ # response = requests.get(url)
+ # soup = BeautifulSoup(response.text, 'html.parser')
+
+ # # Example of extracting all paragraph texts
+ # paragraphs = soup.find_all('p')
+ # for paragraph in paragraphs:
+ #     print(paragraph.text)
+
+
+ # # Extract all text from the body of the HTML document
+ # text = soup.body.get_text(separator=' ', strip=True)
+ # print(text)
+
+ #2
+
+ # import requests
+ # from bs4 import BeautifulSoup
+
+ # # List of URLs to scrape
+ # urls = [
+ #     'https://vietnix.vn/java-la-gi/', 'https://200lab.io/blog/python-la-gi/'
+ #     # Add more URLs as needed
+ # ]
+
+ # for url in urls:
+ #     response = requests.get(url)
+ #     soup = BeautifulSoup(response.text, 'html.parser')
+
+ #     # Extract and print all paragraph texts for each URL
+ #     paragraphs = soup.find_all('p')
+ #     print(f'Content from {url}:')
+ #     for paragraph in paragraphs:
+ #         print(paragraph.text)
+ #     print("\n") # Print a new line for better readability between different URLs
+
+ #     # Extract all text from the body of the HTML document for each URL
+ #     text = soup.body.get_text(separator=' ', strip=True)
+ #     print(f'Full text from {url}:')
+ #     print(text)
+ #     print("="*100) # Print a separator line for better readability between different URLs
+
+ # 4 add save file
+ # import requests
+ # from bs4 import BeautifulSoup
+ # import os
+
+ # # List of URLs to scrape
+ # urls = [
+ #     'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
+ #     # Add more URLs as needed
+ # ]
+
+ # for url in urls:
+ #     response = requests.get(url)
+ #     soup = BeautifulSoup(response.text, 'html.parser')
+
+ #     # Extracting base name of the URL to use as the filename
+ #     filename = os.path.basename(url).replace('%', '_').replace('?', '_') + '.txt'
+
+ #     # Open a new text file for writing the scraped data
+ #     with open(filename, 'w', encoding='utf-8') as file:
+ #         # Write the URL to the file
+ #         file.write(f'Content from {url}:\n')
+
+ #         # Extract and write all paragraph texts for each URL
+ #         paragraphs = soup.find_all('p')
+ #         for paragraph in paragraphs:
+ #             file.write(paragraph.text + '\n')
+ #         file.write("\n") # Write a new line for better readability between different URLs
+
+ #         # Extract and write all text from the body of the HTML document for each URL
+ #         text = soup.body.get_text(separator=' ', strip=True)
+ #         file.write(f'Full text from {url}:\n')
+ #         file.write(text + '\n')
+ #         file.write("="*100 + '\n') # Write a separator line for better readability between different URLs
+
+ #     # Print out a message to let you know the data has been written to the file
+ #     print(f'Scraped data from {url} has been saved to {filename}')
+
+ #5 It has internal link scrapping
+ # import requests
+ # from bs4 import BeautifulSoup
+ # import os
+
+ # # Initial list of main URLs to scan
+ # main_urls = [
+ #     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
+ #     # Add more main URLs as needed
+ # ]
+
+ # # Function to get all unique links from a given URL
+ # def get_all_links(url):
+ #     response = requests.get(url)
+ #     soup = BeautifulSoup(response.text, 'html.parser')
+ #     links = soup.find_all('a')
+ #     unique_links = set()
+ #     for link in links:
+ #         href = link.get('href')
+ #         if href and href.startswith('/wiki/'): # Filters out unwanted links and keeps wikipedia internal links
+ #             complete_link = f"https://en.wikipedia.org{href}"
+ #             unique_links.add(complete_link)
+ #     return list(unique_links)
+
+ # # Iterate over main URLs to get all specific links and scrape data from each
+ # for main_url in main_urls:
+ #     urls = get_all_links(main_url) # Get all sub-links from the main URL
+ #     for url in urls:
+ #         response = requests.get(url)
+ #         soup = BeautifulSoup(response.text, 'html.parser')
+
+ #         # Extracting base name of the URL to use as the filename
+ #         filename = os.path.basename(url).split('#')[0] # Remove URL fragments
+ #         filename = filename.replace('%', '_').replace('?', '_') + '.txt' # Replace special characters
+
+ #         # Open a new text file for writing the scraped data
+ #         with open(filename, 'w', encoding='utf-8') as file:
+ #             # Write the URL to the file
+ #             file.write(f'Content from {url}:\n\n')
+
+ #             # Extract and write all paragraph texts for each URL
+ #             paragraphs = soup.find_all('p')
+ #             for paragraph in paragraphs:
+ #                 file.write(paragraph.text + '\n\n')
+ #             file.write("="*100 + '\n') # Write a separator line for better readability
+
+ #             # Print out a message to let you know the data has been written to the file
+ #             print(f'Scraped data from {url} has been saved to {filename}')
+
+ import requests
+ from bs4 import BeautifulSoup
+ import os
+
+ # Initial list of main URLs to scan
+ main_urls = [
+     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
+     # Add more main URLs as needed
+ ]
+
+ # Function to get all unique links from a given URL
+ def get_all_links(url):
+     response = requests.get(url)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     links = soup.find_all('a')
+     unique_links = set()
+     for link in links:
+         href = link.get('href')
+         if href and not href.startswith('#') and not href.startswith('mailto:'): # Filters out unwanted links like anchors and emails
+             if not href.startswith('http'): # Check if the link is relative
+                 href = url + href # Construct the complete URL
+             unique_links.add(href)
+     return list(unique_links)
+
+ # Iterate over main URLs to get all specific links and scrape data from each
+ for main_url in main_urls:
+     urls = get_all_links(main_url) # Get all sub-links from the main URL
+     for url in urls:
+         response = requests.get(url)
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Extracting base name of the URL to use as the filename
+         filename = os.path.basename(url).split('#')[0] # Remove URL fragments
+         filename = filename.replace('%', '_').replace('?', '_') + '.txt' # Replace special characters
+
+         # Open a new text file for writing the scraped data
+         with open(filename, 'w', encoding='utf-8') as file:
+             # Write the URL to the file
+             file.write(f'Content from {url}:\n\n')
+
+             # Extract and write all paragraph texts for each URL
+             paragraphs = soup.find_all('p')
+             for paragraph in paragraphs:
+                 file.write(paragraph.text + '\n\n')
+             file.write("="*100 + '\n') # Write a separator line for better readability
+
+             # Print out a message to let you know the data has been written to the file
+             print(f'Scraped data from {url} has been saved to {filename}')
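One caveat about the live version of the crawler above: `href = url + href` mishandles root-relative links such as `/about`, because it simply concatenates strings. A small sketch of the usual standard-library fix (a suggestion, not part of the commit; the base URL is the one already listed in main_urls):

from urllib.parse import urljoin

base = "https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills"
for href in ["/another-guide", "relative-page", "https://example.com/x"]:
    # urljoin resolves root-relative, relative and absolute hrefs against the page URL
    print(urljoin(base, href))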