Makima57 committed
Commit 92a289d
1 Parent(s): e3ff8ef

Update app.py

Files changed (1)
  1. app.py +15 -32
app.py CHANGED
@@ -2,19 +2,13 @@ import streamlit as st
 from googlesearch import search
 import requests
 from bs4 import BeautifulSoup
-import chunk # Import the chunking functionality from app2.py
+import chunk # Import the chunking functions from chunk.py
 
 # Function to perform Google search and return the first two links
 def google_search(query):
     try:
-        query = query + "/t site:https://medium.com/"
-        search_results = search(query, num_results=10) # Get up to 10 results
-        first_two_links = []
-        for i, link in enumerate(search_results):
-            if i < 2:
-                first_two_links.append(link)
-            else:
-                break
+        search_results = search(query, num_results=2) # Get first two results
+        first_two_links = [next(search_results, None), next(search_results, None)]
         return first_two_links
     except Exception as e:
         st.error(f"An error occurred: {e}")
@@ -34,15 +28,11 @@ def fetch_webpage_content(url):
 def scrape_text(webpage_content):
     try:
         soup = BeautifulSoup(webpage_content, 'html.parser')
-        # Remove all script and style elements
         for script in soup(["script", "style"]):
             script.decompose()
         text = soup.get_text()
-        # Break the text into lines and remove leading/trailing spaces
         lines = (line.strip() for line in text.splitlines())
-        # Break multi-headlines into a line each
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-        # Drop blank lines
         text = '\n'.join(chunk for chunk in chunks if chunk)
         return text
     except Exception as e:
@@ -50,7 +40,7 @@ def scrape_text(webpage_content):
         return None
 
 # Streamlit app UI
-st.title("Search Link Finder")
+st.title("Search and Chunk Webpage Content")
 
 # Input field for search query
 query = st.text_input("Enter search query", "")
@@ -60,8 +50,8 @@ if st.button("Search"):
     if query:
         first_two_links = google_search(query)
         if first_two_links:
-            for i, link in enumerate(first_two_links):
-                st.success(f"Link {i+1}: [Click here]({link})")
+            for i, link in enumerate(first_two_links, 1):
+                st.success(f"Link {i}: [Click here]({link})")
 
                 # Fetch webpage content
                 webpage_content = fetch_webpage_content(link)
@@ -69,19 +59,12 @@ if st.button("Search"):
                 # Scrape text from webpage content
                 scraped_text = scrape_text(webpage_content)
                 if scraped_text:
-                    st.write(f"Scraped Content from Link {i+1} (Chunked):")
-
-                    # Call the chunking function from app2.py
-                    chunk.display_chunks(scraped_text)
-
-                    # Option to download the entire scraped content
-                    st.download_button(
-                        label=f"Download Full Webpage Content from Link {i+1}",
-                        data=scraped_text,
-                        file_name=f"webpage_content_{i+1}.txt",
-                        mime="text/plain"
-                    )
-        else:
-            st.warning("No results found")
-    else:
-        st.error("Please enter a query")
+                    # Chunk the scraped text using chunk.py
+                    chunked_text = chunk.chunk_text(scraped_text)
+
+                    st.write(f"Chunked Data for Link {i}:")
+                    for chunk_part in chunked_text:
+                        st.write(chunk_part)
+
+                    # Save and download chunked data using the function from chunk.py
+                    chunk.save_and_download_chunked_data(chunked_text, file_name=f"chunked_data_link_{i}.txt")
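
The updated code calls two functions from a chunk.py module that is not part of this commit, so their actual implementations are not visible here. As a rough sketch only, assuming simple fixed-size character chunking and a Streamlit download button (the chunk_size default, the separator, and both function bodies are illustrative guesses, not the real module):

import streamlit as st

def chunk_text(text, chunk_size=500):
    # Hypothetical: split the scraped text into fixed-size character chunks.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

def save_and_download_chunked_data(chunked_text, file_name="chunked_data.txt"):
    # Hypothetical: join the chunks and expose them as a plain-text download.
    data = "\n\n".join(chunked_text)
    st.download_button(
        label=f"Download {file_name}",
        data=data,
        file_name=file_name,
        mime="text/plain"
    )

Note that the rewritten google_search relies on search() from googlesearch returning a generator, which is what makes next(search_results, None) valid; if fewer than two results come back, the returned list will contain None entries that the display loop would need to guard against.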