bonrix committed
Commit bbe1453
1 parent: 4d63838

Create app.py

Files changed (1)
  1. app.py +129 -0
app.py ADDED
@@ -0,0 +1,129 @@
import requests
from bs4 import BeautifulSoup
import openpyxl
import gradio as gr
from urllib.parse import urlparse, urljoin

visited_urls = set()
unique_urls = set()

def create_sitemap_from_url(home_page_url):
    # Reset module-level state so repeated calls from the Gradio UI start fresh
    visited_urls.clear()
    unique_urls.clear()

    def crawl_website(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return

        # Mark the URL as visited
        visited_urls.add(url)

        # Extract the domain of the given URL for same-site filtering
        parsed_url = urlparse(url)

        # Make a GET request to the URL
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip unreachable URLs
            return

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Add the URL to the set of unique URLs
            unique_urls.add(url)

            # Extract all the links on the page
            links = soup.find_all('a')

            # Visit each link
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Build the absolute URL from the page URL and the (possibly relative) href
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Only follow same-domain links that are not image or binary downloads
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(
                            ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                        crawl_website(absolute_url)

    # Start crawling from the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also collected
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    return final_urls

def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Treat unreachable pages the same as non-200 responses
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # URLs grouped by page title

        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for grouping
            if title is None:
                # Skip pages that could not be fetched
                continue
            if title in title_to_urls:
                title_to_urls[title].append(url)
            else:
                title_to_urls[title] = [url]

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        for title, grouped_urls in title_to_urls.items():
            if len(grouped_urls) > 1:  # Only titles shared by more than one URL
                for url in grouped_urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file

    return None

# Create the Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]]
)

# Launch the Gradio interface when the script is run directly
if __name__ == "__main__":
    iface.launch()
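
For a quick check outside the Gradio UI, the exporter can also be called directly once the file above is saved as app.py; because the launch() call sits behind the __main__ guard, importing the module does not start the web server. A minimal sketch, assuming app.py is in the working directory and reusing the example domain already shown in the interface:

    # Minimal sketch: invoke the exporter directly, bypassing the Gradio UI.
    # Assumes the code above is saved as app.py in the current directory;
    # the domain is the interface's own example and is only illustrative.
    from app import fetch_and_save_to_excel

    excel_path = fetch_and_save_to_excel("http://www.embedded-innovations.com/")
    if excel_path:
        print("Duplicate titles written to", excel_path)
    else:
        print("No duplicate titles found (or the home page could not be fetched)")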