Update app.py
app.py CHANGED
@@ -2,148 +2,126 @@ import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
+import os
 import json
 import csv
-import pandas
-import groq
-import os
 
+# Load environment variables
 api_key = os.environ.get('groq')
 read_key = os.environ.get('HF_TOKEN', None)
+
+# Initialize Groq client
+if api_key:
+    from groq import Client as GroqClient
+    client = GroqClient(api_key=api_key)
+else:
+    client = None
 
 # Use Llama 3 70B powered by Groq for answering
 def ask_llm(ort):
+    if not client:
+        return "Groq API key not set."
+
     try:
         completion = client.chat.completions.create(
             model="llama3-70b-8192",
             messages=[
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
             ],
         )
         return completion.choices[0].message.content
     except Exception as e:
         return f"Error in response generation: {str(e)}"
 
 def parse_links_and_content(ort):
     base_url = "https://vereine-in-deutschland.net"
     all_links = []
     all_links_text = []
     initial_url = f"{base_url}/vereine/Bayern/{ort}"
 
     try:
         response = requests.get(initial_url)
         response.raise_for_status()
 
-        # Parse the HTML content using BeautifulSoup
         soup = BeautifulSoup(response.content, 'html.parser')
 
+        # Determine the last page
         link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
+        last_page = 1
         if link_element and 'href' in link_element.attrs:
             href = link_element['href']
-            print(last_two_chars_int)
-        else:
-            last_two_chars_int = 10  # If the last page is not found, assume there is only one page
+            last_page = int(href.split('/')[-1])
 
         # Loop through all pages and collect links
-        for page_number in range(1, last_two_chars_int + 1):
+        for page_number in range(1, last_page + 1):
             page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
             response = requests.get(page_url)
             response.raise_for_status()
             soup = BeautifulSoup(response.content, 'html.parser')
             target_div = soup.select_one('div.row-cols-1:nth-child(4)')
 
             if target_div:
                 links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                 texts = [a.text for a in target_div.find_all('a', href=True)]
-                #print(texts)
                 all_links.extend(links)
                 all_links_text.extend(texts)
             else:
                 print(f"Target div not found on page {page_number}")
 
     except Exception as e:
         return str(e), []
 
     all_links = all_links[0::2]
     all_links_text = all_links_text[0::2]
 
     return all_links_text, all_links
 
 def scrape_links(links):
-    links=links
     details = []
-    for
+    for link in links:
+        try:
+            response = requests.get(link)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            target_nav = soup.select_one('.nav')
+            if target_nav:
+                details.append(target_nav.text.strip())
+            else:
+                details.append("No contact information found")
+        except Exception as e:
+            details.append(f"Error: {str(e)}")
 
     return details
 
-            api_name="/parse_links"
-        )
-        #print(result)
-        contact_details.append(result)
-
-    return contact_details
-
-# Save the JSON data to a CSV file
 def save_to_csv(data, filename):
-    keys = data[0].keys()
+    keys = data[0].keys() if data else []
     with open(filename, 'w', newline='', encoding='utf-8') as output_file:
         dict_writer = csv.DictWriter(output_file, fieldnames=keys)
         dict_writer.writeheader()
         dict_writer.writerows(data)
 
+# Create the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# ")
+    gr.Markdown("# Vereine in Deutschland")
     with gr.Row():
         ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
     with gr.Row():
-        links_output = gr.JSON(label="
-        #links_output = gr.DataFrame(label="Ergebnisse")
-        #json_output = gr.JSON(label="Ergebnisse")
+        links_output = gr.JSON(label="Links")
+        details_output = gr.JSON(label="Details")
 
     def process_ort(ort):
-        return json_data, links
-        #return json_data
+        links_text, links = parse_links_and_content(ort)
+        contact_details = scrape_links(links)
+        json_data = [json.loads(detail) for detail in contact_details if detail.startswith("{")]
+        save_to_csv(json_data, './contact_details.csv')
+        return links_text, contact_details
 
     # Button to start the parsing
-    button = gr.Button("senden")
+    button = gr.Button("Senden")
 
     # Connect the button to the function
-    button.click(fn=process_ort, inputs=ort_input, outputs=[links_output, rechts_output])
+    button.click(fn=process_ort, inputs=ort_input, outputs=[links_output, details_output])
 
+# Launch the Gradio application
 demo.launch()
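
For context, a minimal usage sketch of the updated helper functions follows. It assumes the three functions from app.py are available in the current namespace (importing app.py directly would also start the Gradio UI via demo.launch()), that vereine-in-deutschland.net is reachable, and that "Muenchen" is only an illustrative place name, not something taken from the commit:

# Usage sketch: parse_links_and_content, scrape_links and save_to_csv are assumed to be in scope.
ort = "Muenchen"                               # illustrative input
names, links = parse_links_and_content(ort)    # club names and detail-page URLs
details = scrape_links(links)                  # contact text scraped from each detail page

# Pair each club name with its scraped contact text and persist the result as CSV rows.
rows = [{"verein": name, "kontakt": detail} for name, detail in zip(names, details)]
save_to_csv(rows, "./contact_details.csv")
print(f"Saved {len(rows)} entries")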