Create app.py
app.py
ADDED
import requests
from bs4 import BeautifulSoup
import openpyxl
import gradio as gr
from urllib.parse import urlparse, urljoin

visited_urls = set()
unique_urls = set()

def create_sitemap_from_url(home_page_url):
    # Reset crawl state so repeated calls (e.g. from the Gradio UI) start fresh
    visited_urls.clear()
    unique_urls.clear()

    def crawl_website(url):
        # Skip URLs that have already been visited
        if url in visited_urls:
            return

        # Add URL to the visited set
        visited_urls.add(url)

        # Extract the domain of the given URL for same-site comparison
        parsed_url = urlparse(url)

        # Make a GET request to the URL
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip unreachable URLs
            return

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Add the URL to the set of unique URLs
            unique_urls.add(url)

            # Extract all the links on the page
            links = soup.find_all('a')

            # Visit each link
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Construct the absolute URL by joining the page URL and the relative URL
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Follow only same-domain links that point to web pages (exclude image and binary downloads)
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(
                        ('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')
                    ):
                        crawl_website(absolute_url)

    # Crawl the site starting from the home page
    crawl_website(home_page_url)

    # Drop "http://" URLs whose "https://" counterpart was also found
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    return final_urls

def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Treat unreachable URLs as having no metadata
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text() if soup.find('title') else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # Dictionary to store URLs grouped by title

        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for comparison

            if title in title_to_urls:
                title_to_urls[title].append(url)
            else:
                title_to_urls[title] = [url]

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        for title, urls in title_to_urls.items():
            if len(urls) > 1:  # Only consider titles shared by multiple URLs
                for url in urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file

    return None

# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]]
)

# Launch the Gradio interface
iface.launch()
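
For the Space to start, the third-party imports in app.py must be installable. A minimal requirements.txt sketch, assuming the standard Gradio Space setup in which gradio itself is supplied by the Space's SDK:

requests
beautifulsoup4
openpyxl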