Commit · e38196e
Parent(s): d804491
Update app.py

app.py CHANGED
@@ -3,11 +3,11 @@
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''

-import os,re, requests, uuid, zipfile, hashlib, shutil
+import os, re, requests, uuid, zipfile, hashlib, shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
import torch

# Function to validate URLs

@@ -23,34 +23,29 @@ def finder(url, soup, media_type):
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
-    # Find links
-    else:
-        for link in soup.find_all('a'):
-            file = link.get('href')
-            if file and media_type in file:
-                file_url = file
-                if not validator(file_url): # Assuming 'validator' is a function defined elsewhere
-                    file_url = urljoin(url, file_url)
-                files.append(file_url)
    return files

-def summarize_long_text(text,
+def summarize_long_text(text, model_name="facebook/bart-large-cnn", max_chunk_tokens=500):
    # Initialize the summarization pipeline
-    summarizer = pipeline('summarization')
+    summarizer = pipeline('summarization', model=model_name)

-    #
-
+    # Initialize the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_name)

-    #
-
+    # Tokenize the text
+    tokens = tokenizer.encode(text)

-    #
-
+    # Split the tokens into chunks of the specified size
+    chunks = [tokens[i:i + max_chunk_tokens] for i in range(0, len(tokens), max_chunk_tokens)]

-    #
-    final_summary = '
+    # Summarize each chunk and combine the results
+    final_summary = ''
+    for chunk in chunks:
+        chunk_text = tokenizer.decode(chunk)
+        summary = summarizer(chunk_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
+        final_summary += ' ' + summary

-    return final_summary
+    return final_summary.strip()

def scrapper(url):
    try:

@@ -59,20 +54,17 @@ def scrapper(url):
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
        return None
-
    soup = BeautifulSoup(response.content, 'html.parser')
-
    # Add text files to the text folder
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
-    full_text = ''
-    if text_content:
-        with open('text/content.txt', 'w') as text_file:
-            for line in text_content:
-                text_file.write(line + '\n')
-                full_text += line + ' '
+    full_text = ' '.join(text_content) # Join the text content into a single string

-    #
+    # Save the full text to a file
+    with open('text/content.txt', 'w') as text_file:
+        text_file.write(full_text)
+
+    # Summarize the text
    summary = summarize_long_text(full_text)

    return summary
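
The substance of this commit: summarize_long_text now encodes the scraped text with the model's tokenizer, splits the token ids into 500-token chunks, summarizes each chunk, and stitches the pieces into the final summary; scrapper, in turn, joins the finder output into one string before writing content.txt and summarizing it. As a rough, self-contained sketch (not part of the commit), the tokenize-and-chunk step can be tried on its own as below; the synthetic sample text is an assumption, the checkpoint name and chunk size come from the diff, and only the tokenizer is downloaded, not the summarization model.

# Illustrative sketch: mirrors the tokenize-and-chunk step this commit adds,
# without invoking the summarization pipeline itself.
from transformers import AutoTokenizer

model_name = "facebook/bart-large-cnn"   # same checkpoint app.py now selects
max_chunk_tokens = 500                   # same chunk size as in app.py

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Synthetic stand-in for the scraped page text (assumption for illustration).
text = ("Web scraping gathers the visible text of a page so it can be "
        "stored and summarized later. ") * 150

# Encode to token ids, then slice into fixed-size chunks, as
# summarize_long_text does before calling the summarizer on each piece.
tokens = tokenizer.encode(text)
chunks = [tokens[i:i + max_chunk_tokens] for i in range(0, len(tokens), max_chunk_tokens)]

print(f"{len(tokens)} tokens split into {len(chunks)} chunks")
print(tokenizer.decode(chunks[0])[:200])  # preview of the first chunk's text

Chunks of 500 tokens leave comfortable headroom under the roughly 1,024-token input limit of facebook/bart-large-cnn, so each call to the summarizer receives a slice of the page that fits the model.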