import streamlit as st
from googlesearch import search  # provided by the googlesearch-python package
import requests
from bs4 import BeautifulSoup
import chunk  # local chunk.py helper: chunk_text() and save_and_download_chunked_data()
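
# NOTE: chunk.py is a sibling helper module that is not shown in this file.
# The block below is only an illustrative stand-in, assuming the interface
# used later in this script: chunk_text(text) -> list[str] and
# save_and_download_chunked_data(chunks, file_name). Drop it once the real
# chunk.py is on the path.
if not hasattr(chunk, "chunk_text"):
    def _chunk_text(text, size=500):
        # Naive fixed-width chunking: slice the text every `size` characters
        return [text[i:i + size] for i in range(0, len(text), size)]

    def _save_and_download_chunked_data(chunks, file_name="chunked_data.txt"):
        # Offer the joined chunks as a downloadable text file in the app
        st.download_button("Download chunked data", "\n\n".join(chunks),
                           file_name=file_name)

    chunk.chunk_text = _chunk_text
    chunk.save_and_download_chunked_data = _save_and_download_chunked_data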

# Function to perform a Google search and return the first two result links
def google_search(query):
    try:
        # search() yields result URLs lazily; collect up to two of them
        return list(search(query, num_results=2))
    except Exception as e:
        st.error(f"An error occurred during the search: {e}")
        return None
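
# Caveat: the googlesearch package scrapes Google's result pages, so rapid
# repeated queries can be rate-limited or blocked; keeping num_results small
# (as above) reduces that risk.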

# Function to fetch webpage content
def fetch_webpage_content(url):
    try:
        response = requests.get(url, timeout=10)  # Bound the request so slow hosts don't hang the app
        response.raise_for_status()  # Raise for non-2xx status codes
        return response.text
    except Exception as e:
        st.error(f"Failed to fetch the webpage content: {e}")
        return None
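
# Note: some sites reject the default python-requests User-Agent and answer
# with 403. If that happens, passing a browser-like header usually helps
# (the header string below is just a placeholder):
#   requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})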

# Function to extract readable text from webpage content using BeautifulSoup
def scrape_text(webpage_content):
    try:
        soup = BeautifulSoup(webpage_content, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()  # Remove script and style elements
        text = soup.get_text()  # Get raw text
        lines = (line.strip() for line in text.splitlines())  # Strip each line
        # Split lines on double spaces; `part` avoids shadowing the chunk module
        parts = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(part for part in parts if part)  # Keep non-empty pieces
        return text
    except Exception as e:
        st.error(f"Failed to scrape text from webpage content: {e}")
        return None
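
# The line/phrase cleanup above follows the common BeautifulSoup recipe; a
# shorter alternative that yields similar (not byte-identical) output is:
#   text = soup.get_text(separator="\n", strip=True)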

# Streamlit app UI
st.title("Search and Chunk Webpage Content")

# Input field for search query
query = st.text_input("Enter search query", "")

# Button to trigger search
if st.button("Search"):
    if query:
        first_two_links = google_search(query)  # Get first two links
        if first_two_links:
            for i, link in enumerate(first_two_links, 1):
                st.success(f"Link {i}: [Click here]({link})")  # Display links
                
                # Fetch webpage content
                webpage_content = fetch_webpage_content(link)
                if webpage_content:
                    # Scrape text from webpage content
                    scraped_text = scrape_text(webpage_content)
                    
                    if scraped_text:  # Ensure scraped_text is not empty
                        st.write(f"Scraped Content for Link {i}:")
                        st.text(scraped_text[:500])  # Display first 500 characters of the content
                        
                        # Chunk the scraped text using chunk.py
                        chunked_text = chunk.chunk_text(scraped_text)
                        
                        if chunked_text:  # Ensure chunked_text is not empty
                            st.write(f"Chunked Data for Link {i}:")
                            for chunk_part in chunked_text:
                                st.write(chunk_part)  # Display each chunk

                            # Save and download chunked data using chunk.py
                            chunk.save_and_download_chunked_data(chunked_text, file_name=f"chunked_data_link_{i}.txt")
                        else:
                            st.warning("No chunked data available")
                    else:
                        st.warning("No content scraped from this link")
        else:
            st.warning("No results found")
    else:
        st.error("Please enter a query")