import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import openpyxl
import gradio as gr

def create_sitemap_from_url(home_page_url):
    # Keep crawl state per call so repeated runs of the Gradio app do not
    # carry over URLs from a previous crawl. Note that the crawl is recursive,
    # so extremely deep link chains could hit Python's recursion limit.
    visited_urls = set()
    unique_urls = set()

    def crawl_website(url):
        # Check if URL has already been visited
        if url in visited_urls:
            return

        # Add URL to visited set
        visited_urls.add(url)

        # Extract domain from the given URL
        parsed_url = urlparse(url)
        base_url = parsed_url.scheme + "://" + parsed_url.netloc

        # Make a GET request to the URL (a timeout keeps one slow page from
        # stalling the whole crawl)
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            # Skip URLs that cannot be fetched
            return

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Add the URL to the set of unique URLs
            unique_urls.add(url)

            # Extract all the links on the page
            links = soup.find_all('a')

            # Visit each link
            for link in links:
                href = link.get('href')
                if href and not href.startswith('#'):
                    # Construct the absolute URL by joining the base URL and the relative URL
                    absolute_url = urljoin(url, href)
                    parsed_absolute_url = urlparse(absolute_url)

                    # Check if the URL points to a webpage (excluding image URLs)
                    if parsed_absolute_url.netloc == parsed_url.netloc and not parsed_absolute_url.path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.zip', '.apk', '.msi')):
                        # Recurse into the absolute URL; crawl_website handles
                        # its own request errors, so no try/except is needed here
                        crawl_website(absolute_url)
        else:
            # Handle unsuccessful requests
            return

    # Call the crawl_website function with the desired URL
    crawl_website(home_page_url)

    # Drop "http://" URLs when the same URL was also collected as "https://"
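    # For example (hypothetical URLs): if both "http://example.com/about" and
    # "https://example.com/about" were collected, only the "https://" variant
    # is kept in the returned set.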
    final_urls = set()
    for url in unique_urls:
        if url.startswith("http://"):
            remaining_url = url[len("http://"):]
            if "https://" + remaining_url in unique_urls:
                continue
        final_urls.add(url)

    return final_urls

def fetch_and_save_to_excel(home_page_url):
    def fetch_page_info(url):
        # Fetch a single page and pull its title, meta keywords, and meta
        # description; returns (None, None, None) when the page cannot be read.
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return None, None, None
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title_tag = soup.find('title')
            title = title_tag.get_text() if title_tag else 'No title found'
            keywords = soup.find('meta', {'name': 'keywords'})
            keywords = keywords.get('content') if keywords else 'No keywords found'
            description = soup.find('meta', {'name': 'description'})
            description = description.get('content') if description else 'No description found'
            return title, keywords, description
        return None, None, None

    urls = create_sitemap_from_url(home_page_url)
    if urls:
        title_to_urls = {}  # Dictionary to store URLs grouped by title

        for url in urls:
            title, _, _ = fetch_page_info(url)  # Fetch only the title for comparison
            if title is None:
                continue  # Skip pages that could not be fetched

            if title in title_to_urls:
                title_to_urls[title].append(url)
            else:
                title_to_urls[title] = [url]

        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.append(["URL", "Title", "Keywords", "Description"])

        for title, matching_urls in title_to_urls.items():
            if len(matching_urls) > 1:  # Only consider titles shared by multiple URLs
                for url in matching_urls:
                    fetched_title, keywords, description = fetch_page_info(url)
                    sheet.append([url, fetched_title, keywords, description])

        excel_file = "duplicate_titles.xlsx"
        workbook.save(excel_file)
        return excel_file

    return None

# Create a Gradio interface
iface = gr.Interface(
    fn=fetch_and_save_to_excel,
    inputs="text",
    outputs="file",
    title="Duplicate Titles Finder and Excel Exporter",
    description="Enter a domain name (or homepage URL) to find duplicate titles and export the results to an Excel file.",
    allow_flagging="never",
    examples=[["http://www.embedded-innovations.com/"]]
)

# Launch the Gradio interface
iface.launch()
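
# A minimal sketch of driving the pipeline without the UI (assumes the target
# site is small enough to crawl page by page; example.com is a placeholder):
#
#   excel_path = fetch_and_save_to_excel("https://example.com/")
#   if excel_path:
#       print(f"Report written to {excel_path}")  # duplicate_titles.xlsx
#   else:
#       print("Crawl failed: no pages were reachable")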