claytonsamples committed
Commit e38196e · 1 Parent(s): d804491

Update app.py

Files changed (1):
  1. app.py +23 -31
app.py CHANGED
@@ -3,11 +3,11 @@
 [@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
 '''
 
-import os,re, requests, uuid, zipfile, hashlib, shutil
+import os, re, requests, uuid, zipfile, hashlib, shutil
 import gradio as gr
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
 import torch
 
 # Function to validate URLs
@@ -23,34 +23,29 @@ def finder(url, soup, media_type):
         for tag in text_tags:
             for element in soup.find_all(tag):
                 files.append(element.get_text())
-    # Find links
-    else:
-        for link in soup.find_all('a'):
-            file = link.get('href')
-            if file and media_type in file:
-                file_url = file
-                if not validator(file_url): # Assuming 'validator' is a function defined elsewhere
-                    file_url = urljoin(url, file_url)
-                files.append(file_url)
     return files
 
-def summarize_long_text(text, chunk_size=1024):
+def summarize_long_text(text, model_name="facebook/bart-large-cnn", max_chunk_tokens=500):
     # Initialize the summarization pipeline
-    summarizer = pipeline('summarization')
+    summarizer = pipeline('summarization', model=model_name)
 
-    # Tokenize the text into words
-    words = text.split()
+    # Initialize the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-    # Split the words into chunks of the specified size
-    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    # Tokenize the text
+    tokens = tokenizer.encode(text)
 
-    # Summarize each chunk
-    summarized_chunks = [summarizer(chunk, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] for chunk in chunks]
+    # Split the tokens into chunks of the specified size
+    chunks = [tokens[i:i + max_chunk_tokens] for i in range(0, len(tokens), max_chunk_tokens)]
 
-    # Combine the summarized chunks into the final summary
-    final_summary = ' '.join(summarized_chunks)
+    # Summarize each chunk and combine the results
+    final_summary = ''
+    for chunk in chunks:
+        chunk_text = tokenizer.decode(chunk)
+        summary = summarizer(chunk_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
+        final_summary += ' ' + summary
 
-    return final_summary
+    return final_summary.strip()
 
 def scrapper(url):
     try:
@@ -59,20 +54,17 @@ def scrapper(url):
     except (requests.exceptions.RequestException, ValueError) as e:
         raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
         return None
-
     soup = BeautifulSoup(response.content, 'html.parser')
-
     # Add text files to the text folder
     text_content = finder(url, soup, 'text')
     os.makedirs('text', exist_ok=True)
-    full_text = ''
-    if text_content:
-        with open('text/content.txt', 'w') as text_file:
-            for line in text_content:
-                text_file.write(line + '\n')
-                full_text += line + ' '
+    full_text = ' '.join(text_content) # Join the text content into a single string
 
-    # Initialize the summarization pipeline
+    # Save the full text to a file
+    with open('text/content.txt', 'w') as text_file:
+        text_file.write(full_text)
+
+    # Summarize the text
     summary = summarize_long_text(full_text)
 
     return summary
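
The substance of the commit is that summarize_long_text now chunks by tokenizer tokens instead of whitespace-separated words, which keeps every chunk within the summarizer's input window (facebook/bart-large-cnn accepts at most 1024 tokens, and a 1024-word chunk usually tokenizes to more than that). Below is a minimal standalone sketch of the same token-based chunking idea; the chunk_by_tokens helper, the sample text, and the skip_special_tokens flag are illustrative additions for this sketch, not part of app.py.

```python
from transformers import AutoTokenizer, pipeline

model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model_name)

def chunk_by_tokens(text, max_chunk_tokens=500):
    # Encode once, then slice the token ids so no chunk can exceed the model's input limit.
    tokens = tokenizer.encode(text)
    for i in range(0, len(tokens), max_chunk_tokens):
        yield tokenizer.decode(tokens[i:i + max_chunk_tokens], skip_special_tokens=True)

sample = "Scraped page text goes here. " * 300  # long enough to need several chunks
summary = " ".join(
    summarizer(chunk, max_length=200, min_length=50, do_sample=False)[0]["summary_text"]
    for chunk in chunk_by_tokens(sample)
)
print(summary)
```

In the committed app.py the same slicing happens inline inside summarize_long_text, and scrapper feeds it the joined page text after writing it to text/content.txt.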