enstazao committed
Commit cbcc9fd
1 Parent(s): 6bae4ea

added application files
__pycache__/app.cpython-311.pyc ADDED
Binary file (1.24 kB)
 
__pycache__/main.cpython-311.pyc ADDED
Binary file (3.67 kB)
 
app.py ADDED
@@ -0,0 +1,23 @@
+ import gradio as gr
+ from main import fetch_webpage_content, parse_and_segment_content, summarize_text
+
+ def summarize_webpage(url):
+     html_content = fetch_webpage_content(url)
+     if html_content:
+         chunks = parse_and_segment_content(html_content)
+         summary = summarize_text(chunks)
+         if summary:
+             return summary
+         else:
+             return "Failed to generate a summary."
+     else:
+         return "Failed to fetch or process webpage content."
+
+ interface = gr.Interface(fn=summarize_webpage,
+                          inputs=gr.Textbox(lines=2, placeholder="Enter URL Here..."),
+                          outputs="text",
+                          title="Webpage Summarizer",
+                          description="Paste the URL of a webpage to get summarized content.")
+
+ if __name__ == "__main__":
+     interface.launch()
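
A quick way to exercise summarize_webpage without opening the Gradio UI is to import it directly. This is a minimal sketch, not part of the commit, and the URL below is a placeholder:

    # Hypothetical smoke test for app.py; importing app builds the Interface
    # but does not launch it (launch() is behind the __main__ guard).
    from app import summarize_webpage

    demo_url = "https://example.com"      # placeholder URL, not from the commit
    print(summarize_webpage(demo_url))    # prints the summary, or one of the error strings above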
main.py ADDED
@@ -0,0 +1,54 @@
+ from transformers import pipeline
+ from bs4 import BeautifulSoup
+ import requests
+
+ # @desc Get the content of the web page
+ def fetch_webpage_content(url):
+     try:
+         response = requests.get(url)
+         response.raise_for_status()  # Raises an HTTPError if the status is 4xx, 5xx
+         return response.text
+     except requests.exceptions.RequestException as e:
+         print(f"Error fetching the webpage: {e}")
+         return None
+
+ # @desc Split the scraped content into word-limited chunks
+ def parse_and_segment_content(html_content):
+     soup = BeautifulSoup(html_content, 'html.parser')
+     results = soup.find_all(['h1', 'p'])
+     text = ' '.join([result.text for result in results])
+     text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
+     sentences = text.split('<eos>')  # split on the injected sentence markers
+
+     max_chunk = 500  # maximum words per chunk
+     chunks = []
+     current_chunk = 0
+
+     for sentence in sentences:
+         if len(sentence) > 0:  # Skip empty sentences
+             if len(chunks) == current_chunk + 1:
+                 if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
+                     chunks[current_chunk].extend(sentence.split(' '))
+                 else:
+                     current_chunk += 1
+                     chunks.append(sentence.split(' '))
+             else:
+                 chunks.append(sentence.split(' '))
+
+     for chunk_id in range(len(chunks)):
+         chunks[chunk_id] = ' '.join(chunks[chunk_id]).strip()  # rejoin each word list into a string
+
+     return chunks
+
+ # @desc Summarize each chunk and return the combined summary
+ def summarize_text(chunks):
+     summarizer = pipeline("summarization")
+     summaries = []
+     try:
+         for chunk in chunks:
+             summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
+             summaries.append(summary[0]['summary_text'])
+         return ' '.join(summaries)
+     except Exception as e:
+         print(f"Error in summarization: {e}")
+         return None
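
As a rough illustration of what parse_and_segment_content produces, here is a hypothetical smoke test; the sample HTML is made up, and because the input is far below the 500-word max_chunk it all lands in a single chunk:

    # Not part of the commit: a made-up input to show the chunking behavior.
    from main import parse_and_segment_content

    sample_html = "<h1>Title.</h1><p>First sentence. Second sentence!</p>"
    chunks = parse_and_segment_content(sample_html)
    print(len(chunks))  # 1 -- well under the 500-word limit
    print(chunks[0])    # the heading and both sentences joined into one chunk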
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ beautifulsoup4
+ requests
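
One caveat: pipeline("summarization") needs a deep-learning backend such as PyTorch, and app.py imports gradio, yet neither is listed here. Both come preinstalled on Hugging Face Spaces, which is presumably why they are omitted. For a local run, a quick check like this (not part of the commit) confirms the environment is complete:

    # Hypothetical local sanity check: every module app.py and main.py import must resolve.
    import gradio, requests, bs4, transformers
    print("all imports resolve")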