import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import json
import csv
#import pandas as pd
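
# Scrapes club (Verein) listings for a given Bavarian town from
# vereine-in-deutschland.net and, via an LLM, turns the scraped contact
# details into JSON, all behind a small Gradio UI.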

# Load environment variables
api_key = os.environ.get('GROQ_API_KEY')
read_key = os.environ.get('HF_TOKEN', None)

# Initialize Groq client
if api_key:
    from groq import Client as GroqClient
    client = GroqClient(api_key=api_key)
else:
    client = None

# Use Llama 3 70B powered by Groq for answering (defined here but not wired into the UI below)
def ask_llm(ort):
    if not client:
        return "Groq API key not set."
    
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"

def parse_links_and_content(ort):
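    # Walk the paginated club listing for the given town and collect each
    # club's detail-page URL together with its link text.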
    
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"

    try:
        response = requests.get(initial_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        last_page = 10  # fall back to 10 pages if the pagination element cannot be parsed
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            last_page = int(href.split('/')[-1])

        # Loop through all pages and collect links
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')

            if target_div:
                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links.extend(links)
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")

    except Exception as e:
        return str(e), []

    # Each club is linked twice in the listing markup, so keep every other entry.
    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]

    return all_links_text, all_links

def extract_vereinsname(url):
    parts = url.split('/')
    vereinsname = parts[-1]
    vereinsname = vereinsname.replace("-"," ")
    return vereinsname

def scrape_links(links):
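    # Fetch each club's detail page and pull the contact block out of the
    # .nav section, stripping site-specific filler strings.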
    details = []
    for link in links:
        try:
            response = requests.get(link)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_nav = soup.select_one('.nav')
            parts = link.split('/')
            
            # Log the URL and its parts for debugging
            print(f"Processing URL: {link}")
            print(f"URL parts: {parts}")
            
            # Extract the name of the Verein from the URL
            vereinsname = parts[-1] if parts[-1] else parts[-2]  # Fallback to the second-to-last part if the last part is empty

            if target_nav:
                # Strip site-specific filler from the contact block before storing it
                texte = target_nav.text.strip()
                texte = texte.replace("Amtsgericht: Schweinfurt", "")
                texte = texte.replace("Adresse folgt", "")
                texte = texte.replace("Adresse", "Adresse:")
                texte = texte.replace("Kontakt", "Email:")
                texte = texte.replace("Noch keine Daten vorhanden", "")
                details.append(f"Verein: {vereinsname} {texte}")
            else:
                details.append(f"Verein: {vereinsname} - No contact information found")
        except Exception as e:
            details.append(f"Error: {str(e)}")

    return details

def save_to_csv(data, filename):
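    # Expects a list of dicts; writes a header row plus one row per entry.
    # Currently not called anywhere in this script.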
    keys = data[0].keys() if data else []
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

# Clear the input and the output field
def clear():
    return "", ""

# Create the Gradio interface
with gr.Blocks() as demo:
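    # Minimal UI: a CSV download badge, one text input for the town, an output
    # box for the club list, and Clear/Senden buttons wired up further below.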
    gr.Markdown("[![Download](https://specialist-it.de/downloadbut.png)](https://specialist-it.de/verein.csv)")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
    with gr.Row():
        details_output = gr.Textbox(label="Vereinsliste")

    def process_ort(ort):
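        # End-to-end pipeline for one town: scrape the listing, fetch the raw
        # contact blocks, then ask the hosted Qwen/Qwen2.5-72B-Instruct Space
        # to reformat them into JSON.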
        links_text, links = parse_links_and_content(ort)
        contact_details = scrape_links(links)
        
        from gradio_client import Client

        qwen_client = Client("Qwen/Qwen2.5-72B-Instruct")
        result = qwen_client.predict(
            query=f"Return a valid JSON object with the contact details for each Verein. Return the generated JSON only. \n {contact_details}",
            history=[],
            system="You are an expert for JSON data. Your job is to extract information from text and return a valid JSON object only. No text, no explanations.",
            api_name="/model_chat"
        )
        # result[1] holds the chat history; the assistant reply of the first
        # turn is the generated JSON.
        json_data = result[1][0][1]
        return json_data

    with gr.Row():
        clearbutton = gr.Button("Clear")  
        button = gr.Button("Senden")    

    # Connect the button to the function
    button.click(fn=process_ort, inputs=ort_input, outputs=details_output)
    clearbutton.click(fn=clear, inputs=[], outputs=[ort_input, details_output])

# Launch the Gradio application
demo.launch()
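
# Running this script directly starts the Gradio UI; HF_TOKEN and GROQ_API_KEY
# are read from the environment at import time.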