QuangDinh2102 committed
Commit 850cd75
Parents: bd794f9 94b61d7

phase3/quangdt: Merge from main -> phase3/quangdt

.github/README.md ADDED
@@ -0,0 +1,8 @@
+ # Simple Genarates Question Test
+
+
+ ## Tech Stack
+
+ **Client:** Streamlit
+
+ **Server:** FastAPI
.github/workflows/huggingface.yaml ADDED
@@ -0,0 +1,21 @@
+ name: 🤗 Sync to HuggingFace Space
+
+ on:
+   push:
+     branches: [main]
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       # Checkout repo
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+
+       # Push to hub
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push -f https://MillMin:$HF_TOKEN@huggingface.co/spaces/MillMin/FSA-PROJECT-CV main
.gitignore CHANGED
@@ -5,4 +5,4 @@ credentials/
  data/CV
  data/JD
  data/QUESTION
- !.gitkeep
+ !.gitkeep
dockerfile → Dockerfile RENAMED
@@ -1,4 +1,4 @@
- FROM python:3.10.9
+ FROM python:3.11.7-alpine3.19

  WORKDIR /code

@@ -8,4 +8,6 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

  COPY . .

- CMD ["python", "main.py"]
+ EXPOSE 8000
+
+ CMD python main.py
README.md CHANGED
@@ -1,10 +1,8 @@
-
- # Simple Genarates Question Test
-
-
- ## Tech Stack
-
- **Client:** Streamlit
-
- **Server:** FastAPI
-
+ ---
+ title: FSA PROJECT CV
+ emoji: 🏢
+ colorFrom: green
+ colorTo: red
+ sdk: docker
+ pinned: false
+ ---
app/modules/crud_cvs/models/crud_cvs.py CHANGED
@@ -1,5 +1,7 @@
  import uuid
  from app.configs.database import firebase_bucket, firebase_db
+ import io
+ from docx import Document

  # CRUD operation
  def upload_file_cvs(file):
@@ -16,6 +18,23 @@ def remove_file_cvs(file_url):
      blob.delete()
      return True

+ def file_cv_doc2text(file_url):
+     # download file from firebase storage using "gs://" link
+     blob = firebase_bucket.blob(file_url.split(f"gs://{firebase_bucket.name}/")[1])
+     # download file and return string in file
+     file_bytes = blob.download_as_bytes()
+     # Create a BytesIO object from the file bytes
+     file_stream = io.BytesIO(file_bytes)
+     # Read the .docx file from the BytesIO object
+     doc = Document(file_stream)
+     # Extract text from the .docx file
+     text = ""
+     for paragraph in doc.paragraphs:
+         text += paragraph.text + "\n"
+
+     return text
+
+
  def get_all_cvs():
      # Get all documents from the collection
      docs = firebase_db.collection("cvs").stream()
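Note: a minimal usage sketch of the new file_cv_doc2text helper (not part of the commit), assuming a .docx object already uploaded to the configured firebase_bucket; the gs:// path below is a placeholder.

```python
# Hypothetical example: extract plain text from a stored .docx CV.
from app.modules.crud_cvs.models.crud_cvs import file_cv_doc2text

cv_url = "gs://<bucket-name>/<some-cv>.docx"  # placeholder object, must exist in firebase_bucket
cv_text = file_cv_doc2text(cv_url)
print(cv_text[:200])  # first 200 characters of the extracted paragraphs
```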
app/modules/crud_jds/models/crud_jds.py CHANGED
@@ -17,6 +17,12 @@ def remove_file_jds(file_url):
      blob.delete()
      return True

+ def file_jd_txt2text(file_url):
+     # download file from firebase storage using "gs://" link
+     blob = firebase_bucket.blob(file_url.split(f"gs://{firebase_bucket.name}/")[1])
+     # download file and return string in file
+     return blob.download_as_text()
+
  def get_all_jds():
      # Get all documents from the collection
      docs = firebase_db.collection("jds").stream()
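Note: file_jd_txt2text derives the blob name by stripping the gs:// prefix. A rough illustration of that split (all values below are made up), shown only to clarify the expected URL shape:

```python
# Illustration of the path handling in file_jd_txt2text (hypothetical values).
bucket_name = "my-project.appspot.com"               # stand-in for firebase_bucket.name
jd_url = f"gs://{bucket_name}/jds/backend_jd.txt"    # placeholder object

object_path = jd_url.split(f"gs://{bucket_name}/")[1]
print(object_path)  # "jds/backend_jd.txt" -> the blob name passed to firebase_bucket.blob(...)
```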
app/modules/matching_cv/__init__.py CHANGED
@@ -1,9 +1,11 @@
  import docx

  from fastapi import APIRouter
- from app.modules.matching_cv.models.match_cv_jd_model import Match_JD_CV_Model
+ # from app.modules.matching_cv.models.match_cv_jd_model import Match_JD_CV_Model

- from app.modules.matching_cv.models.matching_cv_logic import result_matching_cv_jd, load_jd_from_id
+ from app.modules.matching_cv.models.matching_cv_logic import result_matching_cv_jd
+ from app.modules.crud_jds.models.crud_jds import get_jd_by_id, file_jd_txt2text
+ from app.modules.crud_cvs.models.crud_cvs import get_cv_by_id, file_cv_doc2text

  cvmatching_router = APIRouter(prefix="/cvmatching", tags=["cvmatching"])

@@ -13,24 +15,44 @@ async def index():

  @cvmatching_router.post("/matching")
  # only upload .pdf or .docx file
- async def matching_cv_jd(
-     jd_upload: Match_JD_CV_Model.jd = Match_JD_CV_Model.jd_default,
-     cv_upload: Match_JD_CV_Model.cv = Match_JD_CV_Model.cv_default):
+ async def matching_cv_jd(id_jd: str, id_cv:str):
      try:
-         # take jd_upload and cv_upload type file
-         jd_upload_type = jd_upload.filename.split(".")[-1]
-         cv_upload_type = cv_upload.filename.split(".")[-1]
-         if jd_upload_type in ["txt"] and cv_upload_type in ["pdf", "docx"]:
-             jd_text = jd_upload.file.read().decode("utf-8")
-             if cv_upload_type == "docx":
-                 cv_text = docx.Document(cv_upload.file).paragraphs
-                 cv_text = "\n".join([i.text for i in cv_text])
-             elif cv_upload_type == "pdf":
-                 return {"message": "This feature is not available yet"}
-             # check matching cv and jd
-             result = result_matching_cv_jd(cv_text, jd_text)
-             return {"result": result}
+         # get jd and cv by id
+         jd_document = get_jd_by_id(id_jd)
+         cv_document = get_cv_by_id(id_cv)
+
+         # download file from firebase storage
+         jd_url = jd_document["jd_url"]
+         cv_url = cv_document["cv_url"]
+
+         # get type file cv from cv_url "gs://bucket_name/file_name"
+         cv_type = cv_url.split(".")[-1]
+         if cv_type == "pdf":
+             return {"message": "This feature is not available yet"}
+         elif cv_type == "docx":
+             cv_text = file_cv_doc2text(cv_url)
          else:
-             return {"message": "Please upload only .txt for JD. And .pdf or .docx file for CV"}
+             return {"message": "Please upload only .pdf or .docx file for CV"}
+
+         # get jd_text from jd_url "gs://bucket_name/file_name"
+         jd_text = file_jd_txt2text(jd_url)
+
+         result = result_matching_cv_jd(cv_text, jd_text)
+         return {"result": result}
+         # # take jd_upload and cv_upload type file
+         # jd_upload_type = jd_upload.filename.split(".")[-1]
+         # cv_upload_type = cv_upload.filename.split(".")[-1]
+         # if jd_upload_type in ["txt"] and cv_upload_type in ["pdf", "docx"]:
+         #     jd_text = jd_upload.file.read().decode("utf-8")
+         #     if cv_upload_type == "docx":
+         #         cv_text = docx.Document(cv_upload.file).paragraphs
+         #         cv_text = "\n".join([i.text for i in cv_text])
+         #     elif cv_upload_type == "pdf":
+         #         return {"message": "This feature is not available yet"}
+         #     # check matching cv and jd
+         #     result = result_matching_cv_jd(cv_text, jd_text)
+         #     return {"result": result}
+         # else:
+         #     return {"message": "Please upload only .txt for JD. And .pdf or .docx file for CV"}
      except Exception as e:
          return {"Error": str(e)}
app/modules/matching_cv/models/matching_cv_logic.py CHANGED
@@ -42,11 +42,8 @@ def result_matching_cv_jd(cv_text, jd_text):
      # create the chat message
      chat_message = chat_template.format_messages(cv=cv_text, jd=jd_text)

-     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY)
+     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY, request_timeout=120)
      chain = llm | parser
      result = chain.invoke(chat_message)

      return result
-
- def load_jd_from_id():
-     pass
app/modules/question_tests_retrieval/__init__.py CHANGED
@@ -2,8 +2,8 @@ from fastapi import APIRouter, UploadFile, File
  from typing import Annotated

  from app.modules.question_tests_retrieval.models.jd2text import jobdes2text
- # from app.modules.question_tests_retrieval.models.text2vector import text2vector
  from app.modules.question_tests_retrieval.models.question_tests_logic import get_question_tests
+ from app.modules.crud_jds.models.crud_jds import get_jd_by_id, file_jd_txt2text

  qtretrieval_router = APIRouter(prefix="/qtretrieval", tags=["qtretrieval"])

@@ -11,18 +11,19 @@ qtretrieval_router = APIRouter(prefix="/qtretrieval", tags=["qtretrieval"])
  async def index():
      return {"message": "Welcome to question retrieval page"}

- @qtretrieval_router.post("/send_jd")
+ @qtretrieval_router.post("/send_jd_to_get_question")
  # only upload .txt file
- async def send_jd(txt_file: Annotated[UploadFile, File(..., description="The JD file (only .txt file)", media_type=["text/plain"])]):
+ async def send_jd_to_get_question(id_jd: str):
      try:
-         # read the txt file with format
-         jobdes = txt_file.file.read().decode("utf-8")
-         sumaryjd_text = jobdes2text(jobdes)
+         jd_document = get_jd_by_id(id_jd)
+         # download jd file from firebase storage
+         jd_file_string = file_jd_txt2text(jd_document["jd_url"])
+         sumaryjd_text = jobdes2text(jd_file_string)
          if get_question_tests(sumaryjd_text):
              return {"message": "Send JD successfully and get question test successfully",
                      "sumaryjd_text": sumaryjd_text}
          else:
              return {"message": "Please upload only .txt file", "error": str(e)}
      except Exception as e:
-         return {"message": "Please upload only .txt file", "error": str(e)}
+         return {"message": "Have error when find JD in database", "error": str(e)}
 
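Note: a similar client sketch for the renamed route (same assumptions as above: local server on port 8000, placeholder ID):

```python
import requests

resp = requests.post(
    "http://localhost:8000/qtretrieval/send_jd_to_get_question",
    params={"id_jd": "<jd-document-id>"},  # placeholder query parameter
)
print(resp.json())  # includes "sumaryjd_text" when question retrieval succeeds
```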
app/modules/question_tests_retrieval/models/jd2text.py CHANGED
@@ -17,7 +17,7 @@ parser = JsonOutputParser()

  def jobdes2text(jobdes: str) -> str:
      # setup the gemini pro
-     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY)
+     llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, convert_system_message_to_human=True, api_key=GOOGLE_API_KEY, request_timeout=120)

      # create the prompt template
      finnal_jd_chat_template = ChatPromptTemplate.from_messages(
@@ -27,7 +27,7 @@ def jobdes2text(jobdes: str) -> str:
                  """Return Job title, level(Fresher, Junior, Senior, ...) and Brief summary of required skills about 20 words from the job description. Use the following format: Job Title is {job title}, Level is {level}, and Brief summary of required skills is {brief summary of required skills}."""
              )
          ),
-         HumanMessagePromptTemplate.from_template("{text}"),
+         HumanMessagePromptTemplate.from_template("{text}"),
      ]
  )

app/modules/question_tests_retrieval/models/question_tests_logic.py CHANGED
@@ -18,7 +18,7 @@ os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

  # Setting model embedding
- embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
+ embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY, request_timeout=120)
  gemini_evaluator = load_evaluator("embedding_distance", distance_metric=EmbeddingDistance.COSINE, embeddings=embedding_model)

  # def compare_vector(vector_extract, vector_des):
app/modules/question_tests_retrieval/models/text2vector.py CHANGED
@@ -10,6 +10,6 @@ os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

  def text2vector(text):
-     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY, request_timeout=120)
      vector = embeddings.embed_query(text)
      return vector
scrapping.py ADDED
@@ -0,0 +1,8 @@
+ from bs4 import BeautifulSoup
+ import requests
+
+ url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"
+ page = requests.get(url)
+ soup = BeautifulSoup(page.text, 'html')
+
+ print(soup)
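Note: scrapping.py prints the whole parsed document. If the goal is the revenue ranking itself, a rough sketch (not part of the commit) is to target the page's first "wikitable"; that class name is an assumption about the page layout.

```python
# Sketch only: print the first few rows of the revenue table.
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"
soup = BeautifulSoup(requests.get(url).text, "html.parser")  # explicit built-in parser

table = soup.find("table", class_="wikitable")  # assumed class for the ranking table
if table is not None:
    for row in table.find_all("tr")[1:6]:  # skip the header, take a few data rows
        cells = [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]
        print(cells)
```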
tools/crawl_data.py ADDED
@@ -0,0 +1,181 @@
+ # import requests
+ # from bs4 import BeautifulSoup
+
+ # url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue' # Replace with the URL you intend to scrape
+ # response = requests.get(url)
+ # soup = BeautifulSoup(response.text, 'html.parser')
+
+ # # Example of extracting all paragraph texts
+ # paragraphs = soup.find_all('p')
+ # for paragraph in paragraphs:
+ #     print(paragraph.text)
+
+
+ # # Extract all text from the body of the HTML document
+ # text = soup.body.get_text(separator=' ', strip=True)
+ # print(text)
+
+ #2
+
+ # import requests
+ # from bs4 import BeautifulSoup
+
+ # # List of URLs to scrape
+ # urls = [
+ #     'https://vietnix.vn/java-la-gi/', 'https://200lab.io/blog/python-la-gi/'
+ #     # Add more URLs as needed
+ # ]
+
+ # for url in urls:
+ #     response = requests.get(url)
+ #     soup = BeautifulSoup(response.text, 'html.parser')
+
+ #     # Extract and print all paragraph texts for each URL
+ #     paragraphs = soup.find_all('p')
+ #     print(f'Content from {url}:')
+ #     for paragraph in paragraphs:
+ #         print(paragraph.text)
+ #     print("\n") # Print a new line for better readability between different URLs
+
+ #     # Extract all text from the body of the HTML document for each URL
+ #     text = soup.body.get_text(separator=' ', strip=True)
+ #     print(f'Full text from {url}:')
+ #     print(text)
+ #     print("="*100) # Print a separator line for better readability between different URLs
+
+ # 4 add save file
+ # import requests
+ # from bs4 import BeautifulSoup
+ # import os
+
+ # # List of URLs to scrape
+ # urls = [
+ #     'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
+ #     # Add more URLs as needed
+ # ]
+
+ # for url in urls:
+ #     response = requests.get(url)
+ #     soup = BeautifulSoup(response.text, 'html.parser')
+
+ #     # Extracting base name of the URL to use as the filename
+ #     filename = os.path.basename(url).replace('%', '_').replace('?', '_') + '.txt'
+
+ #     # Open a new text file for writing the scraped data
+ #     with open(filename, 'w', encoding='utf-8') as file:
+ #         # Write the URL to the file
+ #         file.write(f'Content from {url}:\n')
+
+ #         # Extract and write all paragraph texts for each URL
+ #         paragraphs = soup.find_all('p')
+ #         for paragraph in paragraphs:
+ #             file.write(paragraph.text + '\n')
+ #         file.write("\n") # Write a new line for better readability between different URLs
+
+ #         # Extract and write all text from the body of the HTML document for each URL
+ #         text = soup.body.get_text(separator=' ', strip=True)
+ #         file.write(f'Full text from {url}:\n')
+ #         file.write(text + '\n')
+ #         file.write("="*100 + '\n') # Write a separator line for better readability between different URLs
+
+ #     # Print out a message to let you know the data has been written to the file
+ #     print(f'Scraped data from {url} has been saved to {filename}')
+
+ #5 It has internal link scrapping
+ # import requests
+ # from bs4 import BeautifulSoup
+ # import os
+
+ # # Initial list of main URLs to scan
+ # main_urls = [
+ #     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
+ #     # Add more main URLs as needed
+ # ]
+
+ # # Function to get all unique links from a given URL
+ # def get_all_links(url):
+ #     response = requests.get(url)
+ #     soup = BeautifulSoup(response.text, 'html.parser')
+ #     links = soup.find_all('a')
+ #     unique_links = set()
+ #     for link in links:
+ #         href = link.get('href')
+ #         if href and href.startswith('/wiki/'): # Filters out unwanted links and keeps wikipedia internal links
+ #             complete_link = f"https://en.wikipedia.org{href}"
+ #             unique_links.add(complete_link)
+ #     return list(unique_links)
+
+ # # Iterate over main URLs to get all specific links and scrape data from each
+ # for main_url in main_urls:
+ #     urls = get_all_links(main_url) # Get all sub-links from the main URL
+ #     for url in urls:
+ #         response = requests.get(url)
+ #         soup = BeautifulSoup(response.text, 'html.parser')
+
+ #         # Extracting base name of the URL to use as the filename
+ #         filename = os.path.basename(url).split('#')[0] # Remove URL fragments
+ #         filename = filename.replace('%', '_').replace('?', '_') + '.txt' # Replace special characters
+
+ #         # Open a new text file for writing the scraped data
+ #         with open(filename, 'w', encoding='utf-8') as file:
+ #             # Write the URL to the file
+ #             file.write(f'Content from {url}:\n\n')
+
+ #             # Extract and write all paragraph texts for each URL
+ #             paragraphs = soup.find_all('p')
+ #             for paragraph in paragraphs:
+ #                 file.write(paragraph.text + '\n\n')
+ #             file.write("="*100 + '\n') # Write a separator line for better readability
+
+ #         # Print out a message to let you know the data has been written to the file
+ #         print(f'Scraped data from {url} has been saved to {filename}')
+
+ import requests
+ from bs4 import BeautifulSoup
+ import os
+
+ # Initial list of main URLs to scan
+ main_urls = [
+     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
+     # Add more main URLs as needed
+ ]
+
+ # Function to get all unique links from a given URL
+ def get_all_links(url):
+     response = requests.get(url)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     links = soup.find_all('a')
+     unique_links = set()
+     for link in links:
+         href = link.get('href')
+         if href and not href.startswith('#') and not href.startswith('mailto:'): # Filters out unwanted links like anchors and emails
+             if not href.startswith('http'): # Check if the link is relative
+                 href = url + href # Construct the complete URL
+             unique_links.add(href)
+     return list(unique_links)
+
+ # Iterate over main URLs to get all specific links and scrape data from each
+ for main_url in main_urls:
+     urls = get_all_links(main_url) # Get all sub-links from the main URL
+     for url in urls:
+         response = requests.get(url)
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Extracting base name of the URL to use as the filename
+         filename = os.path.basename(url).split('#')[0] # Remove URL fragments
+         filename = filename.replace('%', '_').replace('?', '_') + '.txt' # Replace special characters
+
+         # Open a new text file for writing the scraped data
+         with open(filename, 'w', encoding='utf-8') as file:
+             # Write the URL to the file
+             file.write(f'Content from {url}:\n\n')
+
+             # Extract and write all paragraph texts for each URL
+             paragraphs = soup.find_all('p')
+             for paragraph in paragraphs:
+                 file.write(paragraph.text + '\n\n')
+             file.write("="*100 + '\n') # Write a separator line for better readability
+
+         # Print out a message to let you know the data has been written to the file
+         print(f'Scraped data from {url} has been saved to {filename}')
+
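Note: the active crawler resolves relative links with `url + href`, which can produce malformed URLs for root-relative paths. A hedged alternative (not part of the commit) is urllib.parse.urljoin:

```python
# Sketch only: resolve an href against the page it was found on.
from urllib.parse import urljoin

def resolve_link(page_url: str, href: str) -> str:
    # urljoin handles root-relative ("/path"), relative ("path"), and absolute ("https://...") hrefs.
    return urljoin(page_url, href)

print(resolve_link("https://proxyway.com/guides/example-guide", "/blog/post"))
# -> "https://proxyway.com/blog/post"
```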