enstazao committed
Commit cbcc9fd • Parent(s): 6bae4ea

added application files
- __pycache__/app.cpython-311.pyc +0 -0
- __pycache__/main.cpython-311.pyc +0 -0
- app.py +23 -0
- main.py +54 -0
- requirements.txt +3 -0
__pycache__/app.cpython-311.pyc
ADDED
Binary file (1.24 kB)
__pycache__/main.cpython-311.pyc
ADDED
Binary file (3.67 kB)
app.py
ADDED
@@ -0,0 +1,23 @@
+import gradio as gr
+from main import fetch_webpage_content, parse_and_segment_content, summarize_text
+
+def summarize_webpage(url):
+    html_content = fetch_webpage_content(url)
+    if html_content:
+        chunks = parse_and_segment_content(html_content)
+        summary = summarize_text(chunks)
+        if summary:
+            return summary
+        else:
+            return "Failed to generate a summary."
+    else:
+        return "Failed to fetch or process webpage content."
+
+interface = gr.Interface(fn=summarize_webpage,
+                         inputs=gr.Textbox(lines=2, placeholder="Enter URL Here..."),
+                         outputs="text",
+                         title="Webpage Summarizer",
+                         description="Paste the URL of a webpage to get summarized content.")
+
+if __name__ == "__main__":
+    interface.launch()
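
app.py only wires three functions from main.py into a Gradio Interface, so the logic can be smoke-tested without starting the server. A minimal sketch, assuming the files above sit in one directory; the URL is a placeholder, not part of this commit:

# Quick check of the pipeline app.py wires together; the URL is hypothetical.
from main import fetch_webpage_content, parse_and_segment_content, summarize_text

html = fetch_webpage_content("https://example.com")  # placeholder target page
if html:
    chunks = parse_and_segment_content(html)
    print(f"got {len(chunks)} chunk(s)")
    print(summarize_text(chunks))
else:
    print("fetch failed")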
main.py
ADDED
@@ -0,0 +1,54 @@
+from transformers import pipeline
+from bs4 import BeautifulSoup
+import requests
+
+# @desc Get the content of the web page
+def fetch_webpage_content(url):
+    try:
+        response = requests.get(url)
+        response.raise_for_status()  # Raises an HTTPError if the status is 4xx or 5xx
+        return response.text
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching the webpage: {e}")
+        return None
+
+# @desc Split the scraped content into chunks
+def parse_and_segment_content(html_content):
+    soup = BeautifulSoup(html_content, 'html.parser')
+    results = soup.find_all(['h1', 'p'])
+    text = ' '.join([result.text for result in results])
+    text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
+    sentences = text.split('<eos>')
+
+    max_chunk = 500
+    chunks = []
+    current_chunk = 0
+
+    for sentence in sentences:
+        if len(sentence) > 0:  # Skip empty sentences
+            if len(chunks) == current_chunk + 1:
+                if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
+                    chunks[current_chunk].extend(sentence.split(' '))
+                else:
+                    current_chunk += 1
+                    chunks.append(sentence.split(' '))
+            else:
+                chunks.append(sentence.split(' '))
+
+    for chunk_id in range(len(chunks)):
+        chunks[chunk_id] = ' '.join(chunks[chunk_id]).strip()
+
+    return chunks
+
+# @desc Summarize each chunk and return the combined summary
+def summarize_text(chunks):
+    summarizer = pipeline("summarization")
+    summaries = []
+    try:
+        for chunk in chunks:
+            summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
+            summaries.append(summary[0]['summary_text'])
+        return ' '.join(summaries)
+    except Exception as e:
+        print(f"Error in summarization: {e}")
+        return None
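
One sharp edge in summarize_text above: pipeline("summarization") is rebuilt on every call, and with no model argument transformers selects a default checkpoint at runtime. A hedged variant that builds the pipeline once and pins a checkpoint (sshleifer/distilbart-cnn-12-6 is an illustrative choice, not one this commit makes):

from transformers import pipeline

# Built once at import time; the pinned model is an assumption for
# illustration, not something main.py or requirements.txt specifies.
_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

def summarize_text(chunks):
    summaries = []
    try:
        for chunk in chunks:
            out = _summarizer(chunk, max_length=50, min_length=30, do_sample=False)
            summaries.append(out[0]['summary_text'])
        return ' '.join(summaries)
    except Exception as e:
        print(f"Error in summarization: {e}")
        return None

Pinning also keeps the Space reproducible if the library's default checkpoint changes.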
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+transformers
+beautifulsoup4
+requests
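
Two dependencies are absent from this list: gradio, which app.py imports (a Gradio Space's SDK image typically provides it), and a deep-learning backend such as torch, which the transformers pipeline requires. A small preflight sketch, assuming torch is the intended backend, that surfaces missing packages before the UI launches:

# Preflight import check; "torch" is an assumption here, since TensorFlow
# would also satisfy the transformers pipeline.
import importlib

for name in ("gradio", "transformers", "bs4", "requests", "torch"):
    try:
        importlib.import_module(name)
        print(f"ok: {name}")
    except ImportError as exc:
        print(f"missing: {name} ({exc})")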