Engineer786 committed on
Commit
0a3a0c3
·
verified ·
1 Parent(s): 960736f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +104 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from bs4 import BeautifulSoup
4
+ import urllib3
5
+ import pandas as pd
6
+ import tempfile
7
+ from groq import Groq
8
+
9
# Initialize Groq client
# The API key is read from the GroqApi environment variable; if it is unset,
# Groq() receives api_key=None and will fail on first use rather than here —
# TODO confirm that deferred failure is the desired behavior.
client = Groq(api_key=os.environ.get('GroqApi'))

# Initialize session state for scraped data
# Streamlit re-executes this script on every user interaction; the guard
# ensures scraped results persist across reruns instead of being reset.
if "scraped_data" not in st.session_state:
    st.session_state.scraped_data = []
16
def scrape_web_data(url, scrape_option):
    """Scrape data from the given URL based on the scrape option.

    Parameters
    ----------
    url : str
        The page to fetch over HTTP(S).
    scrape_option : str
        'data' to collect every non-empty text line of the page,
        'links' to collect every non-empty href attribute.

    Returns
    -------
    list[dict]
        One-key dicts ({'Data': ...} or {'Links': ...}). Always a list:
        empty on non-200 status, network error, or an unknown option.
        (The original implicitly returned None on the non-200 path, and
        its status-error message was attached to the scrape_option chain.)
    """
    try:
        # Create a PoolManager with urllib3 to handle connection pooling/SSL.
        http = urllib3.PoolManager()

        # Send an HTTP GET request.
        response = http.request('GET', url)

        # Only parse on a successful response.
        if response.status == 200:
            soup = BeautifulSoup(response.data, 'html.parser')

            if scrape_option == 'data':
                all_text = soup.get_text()
                return [{'Data': line.strip()} for line in all_text.split('\n') if line.strip()]
            elif scrape_option == 'links':
                links = soup.find_all('a')
                return [{'Links': link.get('href')} for link in links if link.get('href')]
        else:
            # Non-200 response: surface the HTTP status to the user.
            st.write(f"Error: {response.status}")
    except Exception as e:
        # Best-effort tool: report the failure in the UI instead of crashing.
        st.write(f"An error occurred: {e}")
    # Fallthrough for errors and unknown options — callers rely on a list.
    return []
42
+
43
def process_query_with_groq(query, data):
    """Answer *query* with the Groq chat API, using *data* as context.

    *data* is the list of one-key dicts produced by scrape_web_data; all
    of their values are flattened into a newline-joined context block that
    is prepended to the user's question.

    Returns the model's answer string, the fixed "no data" message when
    *data* is empty, or an error-description string on API failure.
    """
    if not data:
        return "No data available to process. Please scrape data first."

    try:
        # Flatten every value of every scraped record into one context blob.
        fragments = []
        for record in data:
            fragments.extend(str(value) for value in record.values())
        combined_text = "\n".join(fragments)

        # Prepend the scraped context to the user's question.
        prompt = f"Context: {combined_text}\n\nUser Query: {query}\nAnswer:"

        # Single-turn chat completion against the llama3 model.
        completion = client.chat.completions.create(
            messages=[
                {"role": "user", "content": prompt}
            ],
            model="llama3-8b-8192",
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error processing query with Groq: {e}"
65
+
66
# Streamlit UI
st.title("Web Scraping and Query Tool")

# Step 1: Scraping — fetch a page and stash the results in session state.
st.subheader("Step 1: Scrape Data")
website_url = st.text_input("Enter the URL to scrape:")
scrape_option = st.selectbox("Select what to scrape:", ['data', 'links'])

if st.button("Scrape Data"):
    scraped_data = scrape_web_data(website_url, scrape_option)
    if scraped_data:
        # Persist across Streamlit reruns so Step 2 can reuse the results.
        st.session_state.scraped_data = scraped_data
        st.success(f"Scraping completed. {len(scraped_data)} items found.")

        # Build the CSV entirely in memory. The original wrote a
        # NamedTemporaryFile(delete=False) and re-opened it without closing,
        # leaking both the file handle and the on-disk file.
        df = pd.DataFrame(scraped_data)
        csv_bytes = df.to_csv(index=False).encode("utf-8")

        # Provide a download button for the CSV data.
        st.download_button(
            label="Download Scraped Data as CSV",
            data=csv_bytes,
            file_name="scraped_data.csv",
            mime="text/csv",
        )
    else:
        st.warning("No data found. Please check the URL or scrape option.")

# Step 2: Querying — ask Groq a question grounded in the scraped data.
st.subheader("Step 2: Ask a Query")
user_query = st.text_input("Enter your query:")
if st.button("Get Answer"):
    if user_query.strip() == "":
        st.warning("Please enter a valid query.")
    else:
        answer = process_query_with_groq(user_query, st.session_state.scraped_data)
        st.write("**Answer:**")
        st.write(answer)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ beautifulsoup4
3
+ urllib3
4
+ pandas
5
+ groq