enstazao committed
Commit 50eb7ef
1 Parent(s): 9a4b00c

updated code

Files changed (1)
  1. main.py +41 -28
main.py CHANGED
@@ -2,53 +2,66 @@ from transformers import pipeline
 from bs4 import BeautifulSoup
 import requests
 
-# @desc Get the content of the web page
 def fetch_webpage_content(url):
+    """Fetch the content of a webpage."""
     try:
-        response = requests.get(url)
-        response.raise_for_status()  # Raises an HTTPError if the status is 4xx, 5xx
+        response = requests.get(url, timeout=10)  # Give up after 10 seconds instead of hanging on a slow server
+        response.raise_for_status()  # Raises an error for bad responses
         return response.text
     except requests.exceptions.RequestException as e:
         print(f"Error fetching the webpage: {e}")
         return None
 
-# @desc Get the chunks of the content from the scraped content
 def parse_and_segment_content(html_content):
+    """Parse and segment HTML content into manageable chunks."""
+    if not html_content:
+        return []
+
     soup = BeautifulSoup(html_content, 'html.parser')
     results = soup.find_all(['h1', 'p'])
     text = ' '.join([result.text for result in results])
     text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
     sentences = text.split('<eos>')
-
+
     max_chunk = 500
     chunks = []
-    current_chunk = 0
-
+    current_chunk = -1
+
     for sentence in sentences:
-        if len(sentence) > 0:  # Check if sentence is not empty
-            if len(chunks) == current_chunk + 1:
-                if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
-                    chunks[current_chunk].extend(sentence.split(' '))
-                else:
-                    current_chunk += 1
-                    chunks.append(sentence.split(' '))
-            else:
-                chunks.append(sentence.split(' '))
-
-    for chunk_id in range(len(chunks)):
-        chunks[chunk_id] = ' '.join(chunks[chunk_id]).strip()
-
+        if len(sentence.strip()) == 0:
+            continue
+        if current_chunk == -1 or len(chunks[current_chunk]) + len(sentence.split()) > max_chunk:
+            chunks.append([])
+            current_chunk += 1
+        chunks[current_chunk].extend(sentence.split())
+
+    chunks = [' '.join(chunk).strip() for chunk in chunks]
     return chunks
 
-# @desc Summarize the content and then return that back
 def summarize_text(chunks):
-    summarizer = pipeline("summarization")
+    """Summarize the given text chunks."""
+    if not chunks:
+        return "No content to summarize."
+
+    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
     summaries = []
-    try:
-        for chunk in chunks:
+
+    for chunk in chunks:
+        try:
             summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
             summaries.append(summary[0]['summary_text'])
-        return ' '.join(summaries)
-    except Exception as e:
-        print(f"Error in summarization: {e}")
-        return None
+        except Exception as e:
+            print(f"Error in summarization: {e}")
+            summaries.append("Error summarizing text.")  # Keep the flow even if summarization fails
+
+    return ' '.join(summaries)
+
+# Example usage
+# url = "https://example.com"
+# html_content = fetch_webpage_content(url)
+# if html_content:
+#     chunks = parse_and_segment_content(html_content)
+#     summary = summarize_text(chunks)
+#     print(summary)
+# else:
+#     print("Failed to fetch or parse webpage content.")