phase3/quangdt: Merge from main -> phase3/quangdt
Changed files:
- .github/README.md +8 -0
- .github/workflows/huggingface.yaml +21 -0
- .gitignore +1 -1
- dockerfile → Dockerfile +4 -2
- README.md +8 -10
- app/modules/crud_cvs/models/crud_cvs.py +19 -0
- app/modules/crud_jds/models/crud_jds.py +6 -0
- app/modules/matching_cv/__init__.py +41 -19
- app/modules/matching_cv/models/matching_cv_logic.py +1 -4
- app/modules/question_tests_retrieval/__init__.py +8 -7
- app/modules/question_tests_retrieval/models/jd2text.py +2 -2
- app/modules/question_tests_retrieval/models/question_tests_logic.py +1 -1
- app/modules/question_tests_retrieval/models/text2vector.py +1 -1
- scrapping.py +8 -0
- tools/crawl_data.py +181 -0
.github/README.md
ADDED
@@ -0,0 +1,8 @@
+ # Simple Genarates Question Test
+
+
+ ## Tech Stack
+
+ **Client:** Streamlit
+
+ **Server:** FastAPI
.github/workflows/huggingface.yaml
ADDED
@@ -0,0 +1,21 @@
+ name: 🤗 Sync to HuggingFace Space
+
+ on:
+   push:
+     branches: [main]
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       # Checkout repo
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+
+       # Push to hub
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push -f https://MillMin:$HF_TOKEN@huggingface.co/spaces/MillMin/FSA-PROJECT-CV main
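The workflow force-pushes the repository to the MillMin/FSA-PROJECT-CV Space on every push to main, authenticating with the HF_TOKEN repository secret. A minimal sketch for checking locally that a token has the expected identity before saving it as that secret (assuming the huggingface_hub package is installed and the token is exported as HF_TOKEN; this is not part of the commit):

import os
from huggingface_hub import HfApi

# Assumption: HF_TOKEN is exported in the local environment for this check;
# in CI the workflow reads it from the GitHub secret instead.
api = HfApi(token=os.environ["HF_TOKEN"])
print(api.whoami()["name"])  # should print the account that owns the target Space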
.gitignore
CHANGED
@@ -5,4 +5,4 @@ credentials/
  data/CV
  data/JD
  data/QUESTION
- !.gitkeep
+ !.gitkeep
dockerfile → Dockerfile
RENAMED
@@ -1,4 +1,4 @@
- FROM python:3.
+ FROM python:3.11.7-alpine3.19

  WORKDIR /code

@@ -8,4 +8,6 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

  COPY . .

-
+ EXPOSE 8000
+
+ CMD python main.py
README.md
CHANGED
@@ -1,10 +1,8 @@
-
-
-
-
-
-
-
-
- **Server:** FastAPI
-
+ ---
+ title: FSA PROJECT CV
+ emoji: 🏢
+ colorFrom: green
+ colorTo: red
+ sdk: docker
+ pinned: false
+ ---
app/modules/crud_cvs/models/crud_cvs.py
CHANGED
@@ -1,5 +1,7 @@
  import uuid
  from app.configs.database import firebase_bucket, firebase_db
+ import io
+ from docx import Document

  # CRUD operation
  def upload_file_cvs(file):
@@ -16,6 +18,23 @@ def remove_file_cvs(file_url):
      blob.delete()
      return True

+ def file_cv_doc2text(file_url):
+     # download file from firebase storage using "gs://" link
+     blob = firebase_bucket.blob(file_url.split(f"gs://{firebase_bucket.name}/")[1])
+     # download file and return string in file
+     file_bytes = blob.download_as_bytes()
+     # Create a BytesIO object from the file bytes
+     file_stream = io.BytesIO(file_bytes)
+     # Read the .docx file from the BytesIO object
+     doc = Document(file_stream)
+     # Extract text from the .docx file
+     text = ""
+     for paragraph in doc.paragraphs:
+         text += paragraph.text + "\n"
+
+     return text
+
+
  def get_all_cvs():
      # Get all documents from the collection
      docs = firebase_db.collection("cvs").stream()
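For a quick offline check of the .docx extraction logic in file_cv_doc2text, a minimal sketch that exercises the same python-docx calls against a local file (the sample.docx path is a hypothetical test fixture; Firebase is deliberately left out):

import io
from docx import Document  # python-docx, the same dependency used above

def docx_bytes_to_text(file_bytes: bytes) -> str:
    # Mirrors file_cv_doc2text, but starts from raw bytes instead of a Firebase blob
    doc = Document(io.BytesIO(file_bytes))
    return "\n".join(p.text for p in doc.paragraphs)

if __name__ == "__main__":
    with open("sample.docx", "rb") as f:  # hypothetical local fixture
        print(docx_bytes_to_text(f.read()))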
app/modules/crud_jds/models/crud_jds.py
CHANGED
@@ -17,6 +17,12 @@ def remove_file_jds(file_url):
      blob.delete()
      return True

+ def file_jd_txt2text(file_url):
+     # download file from firebase storage using "gs://" link
+     blob = firebase_bucket.blob(file_url.split(f"gs://{firebase_bucket.name}/")[1])
+     # download file and return string in file
+     return blob.download_as_text()
+
  def get_all_jds():
      # Get all documents from the collection
      docs = firebase_db.collection("jds").stream()
app/modules/matching_cv/__init__.py
CHANGED
@@ -1,9 +1,11 @@
  import docx

  from fastapi import APIRouter
- from app.modules.matching_cv.models.match_cv_jd_model import Match_JD_CV_Model
+ # from app.modules.matching_cv.models.match_cv_jd_model import Match_JD_CV_Model

- from app.modules.matching_cv.models.matching_cv_logic import result_matching_cv_jd
+ from app.modules.matching_cv.models.matching_cv_logic import result_matching_cv_jd
+ from app.modules.crud_jds.models.crud_jds import get_jd_by_id, file_jd_txt2text
+ from app.modules.crud_cvs.models.crud_cvs import get_cv_by_id, file_cv_doc2text

  cvmatching_router = APIRouter(prefix="/cvmatching", tags=["cvmatching"])

@@ -13,24 +15,44 @@ async def index():

  @cvmatching_router.post("/matching")
  # only upload .pdf or .docx file
- async def matching_cv_jd(
-     jd_upload: Match_JD_CV_Model.jd = Match_JD_CV_Model.jd_default,
-     cv_upload: Match_JD_CV_Model.cv = Match_JD_CV_Model.cv_default):
+ async def matching_cv_jd(id_jd: str, id_cv:str):
      try:
-         #
-
-
-
-
-
-
-
-
-
-
-
+         # get jd and cv by id
+         jd_document = get_jd_by_id(id_jd)
+         cv_document = get_cv_by_id(id_cv)
+
+         # download file from firebase storage
+         jd_url = jd_document["jd_url"]
+         cv_url = cv_document["cv_url"]
+
+         # get type file cv from cv_url "gs://bucket_name/file_name"
+         cv_type = cv_url.split(".")[-1]
+         if cv_type == "pdf":
+             return {"message": "This feature is not available yet"}
+         elif cv_type == "docx":
+             cv_text = file_cv_doc2text(cv_url)
          else:
-             return {"message": "Please upload only .
+             return {"message": "Please upload only .pdf or .docx file for CV"}
+
+         # get jd_text from jd_url "gs://bucket_name/file_name"
+         jd_text = file_jd_txt2text(jd_url)
+
+         result = result_matching_cv_jd(cv_text, jd_text)
+         return {"result": result}
+         # # take jd_upload and cv_upload type file
+         # jd_upload_type = jd_upload.filename.split(".")[-1]
+         # cv_upload_type = cv_upload.filename.split(".")[-1]
+         # if jd_upload_type in ["txt"] and cv_upload_type in ["pdf", "docx"]:
+         #     jd_text = jd_upload.file.read().decode("utf-8")
+         #     if cv_upload_type == "docx":
+         #         cv_text = docx.Document(cv_upload.file).paragraphs
+         #         cv_text = "\n".join([i.text for i in cv_text])
+         #     elif cv_upload_type == "pdf":
+         #         return {"message": "This feature is not available yet"}
+         #     # check matching cv and jd
+         #     result = result_matching_cv_jd(cv_text, jd_text)
+         #     return {"result": result}
+         # else:
+         #     return {"message": "Please upload only .txt for JD. And .pdf or .docx file for CV"}
      except Exception as e:
          return {"Error": str(e)}
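The endpoint now takes Firestore document ids instead of file uploads. A minimal sketch of calling it with FastAPI's TestClient (assuming the application object is exposed as app in main.py; the ids are hypothetical placeholders):

from fastapi.testclient import TestClient

from main import app  # assumption: main.py exposes the FastAPI instance as `app`

client = TestClient(app)

# Hypothetical Firestore document ids for an existing JD and CV
response = client.post(
    "/cvmatching/matching",
    params={"id_jd": "<jd-document-id>", "id_cv": "<cv-document-id>"},
)
print(response.json())  # {"result": ...} on success, or an error message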
app/modules/matching_cv/models/matching_cv_logic.py
CHANGED
@@ -42,11 +42,8 @@ def result_matching_cv_jd(cv_text, jd_text):
      # create the chat message
      chat_message = chat_template.format_messages(cv=cv_text, jd=jd_text)

-     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY)
+     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY, request_timeout=120)
      chain = llm | parser
      result = chain.invoke(chat_message)

      return result
-
- def load_jd_from_id():
-     pass
app/modules/question_tests_retrieval/__init__.py
CHANGED
@@ -2,8 +2,8 @@ from fastapi import APIRouter, UploadFile, File
  from typing import Annotated

  from app.modules.question_tests_retrieval.models.jd2text import jobdes2text
- # from app.modules.question_tests_retrieval.models.text2vector import text2vector
  from app.modules.question_tests_retrieval.models.question_tests_logic import get_question_tests
+ from app.modules.crud_jds.models.crud_jds import get_jd_by_id, file_jd_txt2text

  qtretrieval_router = APIRouter(prefix="/qtretrieval", tags=["qtretrieval"])

@@ -11,18 +11,19 @@ qtretrieval_router = APIRouter(prefix="/qtretrieval", tags=["qtretrieval"])
  async def index():
      return {"message": "Welcome to question retrieval page"}

- @qtretrieval_router.post("/
+ @qtretrieval_router.post("/send_jd_to_get_question")
  # only upload .txt file
- async def
+ async def send_jd_to_get_question(id_jd: str):
      try:
-
-
-
+         jd_document = get_jd_by_id(id_jd)
+         # download jd file from firebase storage
+         jd_file_string = file_jd_txt2text(jd_document["jd_url"])
+         sumaryjd_text = jobdes2text(jd_file_string)
          if get_question_tests(sumaryjd_text):
              return {"message": "Send JD successfully and get question test successfully",
                      "sumaryjd_text": sumaryjd_text}
          else:
              return {"message": "Please upload only .txt file", "error": str(e)}
      except Exception as e:
-         return {"message": "
+         return {"message": "Have error when find JD in database", "error": str(e)}
app/modules/question_tests_retrieval/models/jd2text.py
CHANGED
@@ -17,7 +17,7 @@ parser = JsonOutputParser()

  def jobdes2text(jobdes: str) -> str:
      # setup the gemini pro
-     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY)
+     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY, request_timeout=120)

      # create the prompt template
      finnal_jd_chat_template = ChatPromptTemplate.from_messages(
@@ -27,7 +27,7 @@ def jobdes2text(jobdes: str) -> str:
              """Return Job title, level(Fresher, Junior, Senior, ...) and Brief summary of required skills about 20 words from the job description. Use the following format: Job Title is {job title}, Level is {level}, and Brief summary of required skills is {brief summary of required skills}."""
          )
      ),
-         HumanMessagePromptTemplate.from_template("{text}"),
+         HumanMessagePromptTemplate.from_template("{text}"),
      ]
  )
app/modules/question_tests_retrieval/models/question_tests_logic.py
CHANGED
@@ -18,7 +18,7 @@ os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

  # Setting model embedding
- embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
+ embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY, request_timeout=120)
  gemini_evaluator = load_evaluator("embedding_distance", distance_metric=EmbeddingDistance.COSINE, embeddings=embedding_model)

  # def compare_vector(vector_extract, vector_des):
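For context, the embedding-distance evaluator configured above is typically invoked along these lines. This is a sketch only: the strings are illustrative, gemini_evaluator is the object built in the module, and the call performs a real embedding request, so GOOGLE_API_KEY must be set:

# Sketch: gemini_evaluator comes from the module above; the texts are made up.
result = gemini_evaluator.evaluate_strings(
    prediction="Junior Python developer with FastAPI experience",
    reference="Fresher backend developer, Python and REST APIs",
)
print(result)  # e.g. {"score": 0.12}; a smaller cosine distance means more similar texts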
app/modules/question_tests_retrieval/models/text2vector.py
CHANGED
@@ -10,6 +10,6 @@ os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

  def text2vector(text):
-     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY, request_timeout=120)
      vector = embeddings.embed_query(text)
      return vector
scrapping.py
ADDED
@@ -0,0 +1,8 @@
+ from bs4 import BeautifulSoup
+ import requests
+
+ url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"
+ page = requests.get(url)
+ soup = BeautifulSoup(page.text, 'html')
+
+ print(soup)
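A slightly more defensive variant of the same snippet, naming bs4's built-in parser explicitly and failing fast on HTTP errors (a suggestion only, not part of the commit):

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"
page = requests.get(url, timeout=30)
page.raise_for_status()  # stop here instead of parsing an error page
soup = BeautifulSoup(page.text, "html.parser")  # explicit parser avoids bs4's parser guessing
print(soup.title.text)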
tools/crawl_data.py
ADDED
@@ -0,0 +1,181 @@
+ # import requests
+ # from bs4 import BeautifulSoup
+
+ # url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue' # Replace with the URL you intend to scrape
+ # response = requests.get(url)
+ # soup = BeautifulSoup(response.text, 'html.parser')
+
+ # # Example of extracting all paragraph texts
+ # paragraphs = soup.find_all('p')
+ # for paragraph in paragraphs:
+ #     print(paragraph.text)
+
+
+ # # Extract all text from the body of the HTML document
+ # text = soup.body.get_text(separator=' ', strip=True)
+ # print(text)
+
+ #2
+
+ # import requests
+ # from bs4 import BeautifulSoup
+
+ # # List of URLs to scrape
+ # urls = [
+ #     'https://vietnix.vn/java-la-gi/', 'https://200lab.io/blog/python-la-gi/'
+ #     # Add more URLs as needed
+ # ]
+
+ # for url in urls:
+ #     response = requests.get(url)
+ #     soup = BeautifulSoup(response.text, 'html.parser')
+
+ #     # Extract and print all paragraph texts for each URL
+ #     paragraphs = soup.find_all('p')
+ #     print(f'Content from {url}:')
+ #     for paragraph in paragraphs:
+ #         print(paragraph.text)
+ #     print("\n") # Print a new line for better readability between different URLs
+
+ #     # Extract all text from the body of the HTML document for each URL
+ #     text = soup.body.get_text(separator=' ', strip=True)
+ #     print(f'Full text from {url}:')
+ #     print(text)
+ #     print("="*100) # Print a separator line for better readability between different URLs
+
+ # 4 add save file
+ # import requests
+ # from bs4 import BeautifulSoup
+ # import os
+
+ # # List of URLs to scrape
+ # urls = [
+ #     'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
+ #     # Add more URLs as needed
+ # ]
+
+ # for url in urls:
+ #     response = requests.get(url)
+ #     soup = BeautifulSoup(response.text, 'html.parser')
+
+ #     # Extracting base name of the URL to use as the filename
+ #     filename = os.path.basename(url).replace('%', '_').replace('?', '_') + '.txt'
+
+ #     # Open a new text file for writing the scraped data
+ #     with open(filename, 'w', encoding='utf-8') as file:
+ #         # Write the URL to the file
+ #         file.write(f'Content from {url}:\n')
+
+ #         # Extract and write all paragraph texts for each URL
+ #         paragraphs = soup.find_all('p')
+ #         for paragraph in paragraphs:
+ #             file.write(paragraph.text + '\n')
+ #         file.write("\n") # Write a new line for better readability between different URLs
+
+ #         # Extract and write all text from the body of the HTML document for each URL
+ #         text = soup.body.get_text(separator=' ', strip=True)
+ #         file.write(f'Full text from {url}:\n')
+ #         file.write(text + '\n')
+ #         file.write("="*100 + '\n') # Write a separator line for better readability between different URLs
+
+ #     # Print out a message to let you know the data has been written to the file
+ #     print(f'Scraped data from {url} has been saved to {filename}')
+
+ #5 It has internal link scrapping
+ # import requests
+ # from bs4 import BeautifulSoup
+ # import os
+
+ # # Initial list of main URLs to scan
+ # main_urls = [
+ #     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
+ #     # Add more main URLs as needed
+ # ]
+
+ # # Function to get all unique links from a given URL
+ # def get_all_links(url):
+ #     response = requests.get(url)
+ #     soup = BeautifulSoup(response.text, 'html.parser')
+ #     links = soup.find_all('a')
+ #     unique_links = set()
+ #     for link in links:
+ #         href = link.get('href')
+ #         if href and href.startswith('/wiki/'): # Filters out unwanted links and keeps wikipedia internal links
+ #             complete_link = f"https://en.wikipedia.org{href}"
+ #             unique_links.add(complete_link)
+ #     return list(unique_links)
+
+ # # Iterate over main URLs to get all specific links and scrape data from each
+ # for main_url in main_urls:
+ #     urls = get_all_links(main_url) # Get all sub-links from the main URL
+ #     for url in urls:
+ #         response = requests.get(url)
+ #         soup = BeautifulSoup(response.text, 'html.parser')
+
+ #         # Extracting base name of the URL to use as the filename
+ #         filename = os.path.basename(url).split('#')[0] # Remove URL fragments
+ #         filename = filename.replace('%', '_').replace('?', '_') + '.txt' # Replace special characters
+
+ #         # Open a new text file for writing the scraped data
+ #         with open(filename, 'w', encoding='utf-8') as file:
+ #             # Write the URL to the file
+ #             file.write(f'Content from {url}:\n\n')
+
+ #             # Extract and write all paragraph texts for each URL
+ #             paragraphs = soup.find_all('p')
+ #             for paragraph in paragraphs:
+ #                 file.write(paragraph.text + '\n\n')
+ #             file.write("="*100 + '\n') # Write a separator line for better readability
+
+ #             # Print out a message to let you know the data has been written to the file
+ #             print(f'Scraped data from {url} has been saved to {filename}')
+
+ import requests
+ from bs4 import BeautifulSoup
+ import os
+
+ # Initial list of main URLs to scan
+ main_urls = [
+     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
+     # Add more main URLs as needed
+ ]
+
+ # Function to get all unique links from a given URL
+ def get_all_links(url):
+     response = requests.get(url)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     links = soup.find_all('a')
+     unique_links = set()
+     for link in links:
+         href = link.get('href')
+         if href and not href.startswith('#') and not href.startswith('mailto:'): # Filters out unwanted links like anchors and emails
+             if not href.startswith('http'): # Check if the link is relative
+                 href = url + href # Construct the complete URL
+             unique_links.add(href)
+     return list(unique_links)
+
+ # Iterate over main URLs to get all specific links and scrape data from each
+ for main_url in main_urls:
+     urls = get_all_links(main_url) # Get all sub-links from the main URL
+     for url in urls:
+         response = requests.get(url)
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Extracting base name of the URL to use as the filename
+         filename = os.path.basename(url).split('#')[0] # Remove URL fragments
+         filename = filename.replace('%', '_').replace('?', '_') + '.txt' # Replace special characters
+
+         # Open a new text file for writing the scraped data
+         with open(filename, 'w', encoding='utf-8') as file:
+             # Write the URL to the file
+             file.write(f'Content from {url}:\n\n')
+
+             # Extract and write all paragraph texts for each URL
+             paragraphs = soup.find_all('p')
+             for paragraph in paragraphs:
+                 file.write(paragraph.text + '\n\n')
+             file.write("="*100 + '\n') # Write a separator line for better readability
+
+             # Print out a message to let you know the data has been written to the file
+             print(f'Scraped data from {url} has been saved to {filename}')
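One caveat about the live version of the crawler above: `href = url + href` mishandles root-relative links such as `/about`, because it simply concatenates strings. A small sketch of the usual standard-library fix (a suggestion, not part of the commit; the base URL is the one already listed in main_urls):

from urllib.parse import urljoin

base = "https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills"
for href in ["/another-guide", "relative-page", "https://example.com/x"]:
    # urljoin resolves root-relative, relative and absolute hrefs against the page URL
    print(urljoin(base, href))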