Spaces:

mgokg
/

gemini-2.0-flash-exp

Running

App Files Files Community

mgokg committed on Dec 2, 2024

Commit

2f3bf94

verified ·

1 Parent(s): e410dd0

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -18

app.py CHANGED Viewed

@@ -5,32 +5,51 @@ from urllib.parse import urljoin
 def parse_links_and_content(ort):
     """Fetch the listing page for ``ort`` and extract the target element's links.

     Requests ``{base_url}/vereine/Bayern/{ort}`` and looks up the first element
     matching ``div.row-cols-1:nth-child(4)``.

     Returns a ``(html_code, links)`` tuple: the element's HTML as a string and
     the absolute URLs of every ``<a href>`` inside it. When the element is
     missing the result is ``("Target div not found", [])``; when any exception
     is raised the result is ``(str(e), [])``.
     """
     base_url = "https://vereine-in-deutschland.net"
     # Build the full URL
-    url = f"{base_url}/vereine/Bayern/{ort}"
     try:
-        # Send the request to the URL
-        response = requests.get(url)
         response.raise_for_status()  # Check that the request succeeded
         # Parse the HTML content using BeautifulSoup
         soup = BeautifulSoup(response.content, 'html.parser')
-        # Find the element with the CSS selector
-        target_div = soup.select_one('div.row-cols-1:nth-child(4)')
-        if target_div:
-            # Extract all links from the element and prepend the base URL
-            links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
-            # Extract the HTML code of the element
-            html_code = str(target_div)
-            return html_code, links
         else:
-            return "Target div not found", []
     except Exception as e:
         return str(e), []
 def scrape_links(links):
     results = []
@@ -57,20 +76,19 @@ with gr.Blocks() as demo:
     gr.Markdown("# Vereine in Bayern Parser")
     ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
-    html_output = gr.Code(label="HTML-Code des Elements", language="html")
     links_output = gr.JSON(label="Gefundene Links")
     content_output = gr.JSON(label="Inhalt der Links")
     def process_ort(ort):
         # Click handler: parse the listing for ``ort``, hand the links to
         # scrape_links, and return one value per output component.
-        html_code, links = parse_links_and_content(ort)
         scraped_content = scrape_links(links)
-        return html_code, links, scraped_content
     # Button that starts the parsing
     button = gr.Button("Parse und Scrape")
     # Wire the button to the handler function
-    button.click(fn=process_ort, inputs=ort_input, outputs=[html_output, links_output, content_output])
 # Launch the Gradio application
-demo.launch()

 def parse_links_and_content(ort):
     """Collect the absolute links from every paginated listing page for ``ort``.

     Page 1 (``{base_url}/vereine/Bayern/{ort}/p/1``) is fetched first and its
     pagination control is inspected to find the number of the last page. On
     each page, the element matching ``div.row-cols-1:nth-child(4)`` is
     searched for ``<a href>`` links, which are resolved against ``base_url``.

     Args:
         ort: Place name as it appears in the site's URL path.

     Returns:
         A list of absolute URLs. The list is empty when a request or parsing
         step fails; the failure is printed instead of being returned, so the
         caller always receives a list it can iterate over.
     """
     import re  # local import: this function is the only user

     base_url = "https://vereine-in-deutschland.net"
     request_timeout = 30  # seconds; without it a stalled server blocks forever
     all_links = []

     def fetch_soup(page_number):
         # Download one listing page and parse it; raises on HTTP errors.
         page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
         response = requests.get(page_url, timeout=request_timeout)
         response.raise_for_status()
         return BeautifulSoup(response.content, 'html.parser')

     try:
         soup = fetch_soup(1)

         # Determine the last page from the pagination control. Read the whole
         # trailing number of the href rather than a fixed two characters, so
         # single-digit ("/p/5") and three-digit page counts both work.
         last_page = 1  # assume a single page when no usable link is found
         link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
         if link_element and 'href' in link_element.attrs:
             trailing_number = re.search(r'(\d+)/?$', link_element['href'])
             if trailing_number:
                 last_page = int(trailing_number.group(1))

         # Walk all pages and gather their links.
         for page_number in range(1, last_page + 1):
             if page_number > 1:
                 # Page 1 is already parsed above; only fetch the later pages.
                 soup = fetch_soup(page_number)
             target_div = soup.select_one('div.row-cols-1:nth-child(4)')
             if target_div:
                 all_links.extend(
                     urljoin(base_url, a['href'])
                     for a in target_div.find_all('a', href=True)
                 )
             else:
                 print(f"Target div not found on page {page_number}")
     except Exception as e:
         # Keep the return type a list: the caller feeds the result straight
         # into scrape_links, so an error string must never appear as a link.
         print(f"Failed to collect links for {ort!r}: {e}")
         return []
     return all_links
 def scrape_links(links):
     results = []
     gr.Markdown("# Vereine in Bayern Parser")
     ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
     links_output = gr.JSON(label="Gefundene Links")
     content_output = gr.JSON(label="Inhalt der Links")
     def process_ort(ort):
         # Click handler: collect the links for ``ort``, hand them to
         # scrape_links, and return one value per output component.
+        links = parse_links_and_content(ort)
         scraped_content = scrape_links(links)
+        return links, scraped_content
     # Button that starts the parsing
     button = gr.Button("Parse und Scrape")
     # Wire the button to the handler function
+    button.click(fn=process_ort, inputs=ort_input, outputs=[links_output, content_output])
 # Launch the Gradio application
+demo.launch()