Create app.py
app.py
ADDED
import requests
from bs4 import BeautifulSoup
import openpyxl
import gradio as gr
from urllib.parse import urlparse, urljoin

visited_urls = set()
unique_urls = set()

def create_sitemap_from_url(home_page_url):
    # Reset crawl state so repeated calls (e.g. from the Gradio UI) start fresh
    visited_urls.clear()
    unique_urls.clear()

    def crawl_website(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return

        # Add URL to the visited set
        visited_urls.add(url)

        # Extract the domain of the given URL for same-site comparison
        parsed_url = urlparse(url)

        # Make a GET request to the URL
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip unreachable URLs
            return

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Add the URL to the set of unique URLs
            unique_urls.add(url)

            # Extract all the links on the page
            links = soup.find_all('a')

            # Visit each link
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Construct the absolute URL by joining the page URL and the relative URL
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Follow only same-domain links that point to web pages (exclude image and binary downloads)
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(
                        ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')
                    ):
                        crawl_website(absolute_url)

    # Crawl the site starting from the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also found
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    return final_urls

def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Treat unreachable URLs as having no metadata
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # Dictionary to store URLs grouped by title

        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for comparison

            if title in title_to_urls:
                title_to_urls[title].append(url)
            else:
                title_to_urls[title] = [url]

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        for title, urls in title_to_urls.items():
            if len(urls) > 1:  # Only consider titles shared by multiple URLs
                for url in urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file

    return None

# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]]
)

# Launch the Gradio interface
iface.launch()
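
For the Space to start, the third-party imports in app.py must be installable. A minimal requirements.txt sketch, assuming the standard Gradio Space setup in which gradio itself is supplied by the Space's SDK:

requests
beautifulsoup4
openpyxl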