import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from gradio_client import Client
import json
import csv
import pandas
import groq
import os

api_key = os.environ.get('groq')
read_key = os.environ.get('HF_TOKEN', None)
client = groq.Client(api_key=api_key)

# Use Llama 3 70B powered by Groq for answering
def ask_llm(ort):
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
            ],
        )       
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"
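# Illustrative call (assumption: not wired into the UI below); the place name is sent as
# the user message and the instruction text asks for a short answer in German:
#   print(ask_llm("Augsburg"))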

def parse_links_and_content(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"
    
    try:
        response = requests.get(initial_url)
        response.raise_for_status()  # raise an exception if the request failed
        
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Determine the last page number from the pagination control
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            # Extract the page number from the end of the URL (works for single- and double-digit pages)
            last_page = href.rstrip('/').split('/')[-1]
            # Convert the page number to an integer
            last_page_int = int(last_page)
            print(last_page_int)
        else:
            last_page_int = 10  # fall back to 10 pages if the last page cannot be determined

        # Loop over all pages and collect the links
        for page_number in range(1, last_page_int + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()            
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            
            if target_div:
                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                texts = [a.text for a in target_div.find_all('a', href=True)] 
                #print(texts)
                all_links.extend(links)
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")
        
    except Exception as e:
        return str(e), []
        
    # keep every second entry (each listing appears to yield duplicate <a> tags)
    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]
    return all_links_text, all_links
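# For illustration, the assumed return shape is a pair of parallel lists
# (club names, club URLs); on a request error the function returns (error_message, []) instead.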

def scrape_links(links):
    contact_details = []
    client = Client("mgokg/PerplexicaApi")
    for verein in links:
        result = client.predict(
            prompt=f"{verein}",
            api_name="/parse_links"
        )
        #print(result)
        contact_details.append(result)
        
    return contact_details
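# Note: this assumes the mgokg/PerplexicaApi Space exposes a "/parse_links" endpoint that
# returns one JSON string of contact details per club URL; that contract is inferred from
# how the results were parsed with json.loads downstream and is not verified here.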

# Save the JSON data to a CSV file
def save_to_csv(data, filename): 
    keys = data[0].keys() 
    with open(filename, 'w', newline='', encoding='utf-8') as output_file: 
        dict_writer = csv.DictWriter(output_file, fieldnames=keys) 
        dict_writer.writeheader() 
        dict_writer.writerows(data)
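# Sketch of how the CSV export could be wired up (an assumption mirroring the commented-out
# steps in process_ort below; not executed by the app, "Augsburg" is just an example input):
#   names, links = parse_links_and_content("Augsburg")
#   contact = scrape_links(links)
#   json_data = [json.loads(item) for item in contact]
#   save_to_csv(json_data, './contact_details.csv')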

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# ")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
    with gr.Row():
        links_output = gr.JSON(label="Antwort")
        rechts_output = gr.JSON(label="Antwort")
    #links_output = gr.DataFrame(label="Ergebnisse")
    #json_output = gr.JSON(label="Ergebnisse")

    def process_ort(ort):
        # Currently unused: the button below calls parse_links_and_content directly.
        # The commented-out follow-up (scrape_links + save_to_csv) is sketched after
        # save_to_csv above.
        links = parse_links_and_content(ort)
        return links
        
    # Button to start parsing
    button = gr.Button("senden")
    
    # Connect the button to the function
    button.click(fn=parse_links_and_content, inputs=ort_input, outputs=[links_output, rechts_output])

# Launch the Gradio application
demo.launch()