Spaces:
Sleeping
Sleeping
first test
Browse files- app.py +69 -0
- loadhtml.py +33 -0
- model.py +84 -0
- requirements.txt +53 -0
- utils.py +13 -0
app.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import time
|
3 |
+
from utils import valid_url
|
4 |
+
from model import get_tidy_tab_t5, predict_model_t5
|
5 |
+
from model import get_tidy_tab_pegasus, predict_model_pegasus
|
6 |
+
from model import load_model_bart, predict_model_bart
|
7 |
+
from loadhtml import get_content
|
8 |
+
|
9 |
+
# --- Page header ---
st.title("Tab Recall Simplified 🚀")
st.markdown("Condense your Browser Tabs into a few impactful words. - Inspired in Arc Max Browser")

# --- Sidebar: URL input ---
st.sidebar.caption("Tidy Tabs - Title")
user_input_url = st.sidebar.text_input('Enter your url:')

# Handle to the sidebar error element, if one is currently displayed.
error_message_url = None
|
23 |
+
|
24 |
+
def load_tab():
    """Validate the sidebar URL, fetch the page, and write model summaries.

    Reads the module-level ``user_input_url`` text-input value and renders
    results (or an error message) into the sidebar. Returns nothing.
    """
    global error_message_url  # BUG FIX: original never updated the module-level handle
    if user_input_url:
        # Clear any error left over from a previous attempt.
        if error_message_url:
            error_message_url.empty()
            error_message_url = None
        is_url_valid, url = valid_url(user_input_url)
        if is_url_valid:
            text, title = get_content(url)
            if text == "":
                # BUG FIX: original only did print("error"); give the user feedback.
                st.sidebar.error("Could not extract any content from that page.")
            else:
                with st.spinner('Wait for it...'):
                    st.sidebar.write(f'<title>: **{title}**')
                    time.sleep(1)
                with st.spinner('Wait for it...'):
                    st.sidebar.write(f'T5-small: **{predict_model_t5(text)}**')
                with st.spinner('Wait for it...'):
                    st.sidebar.write(f'Pegasus xsum: **{predict_model_pegasus(text)}**')
                with st.spinner('Wait for it...'):
                    st.sidebar.write(f'Pegasus Bart: **{predict_model_bart(text)}**')
        else:
            # BUG FIX: original formatted undefined `text` here (NameError on any
            # invalid URL); show the user's raw input instead, and keep the element
            # so the next run can clear it.
            error_message_url = st.sidebar.error(
                f'{user_input_url} is not a valid URL. Please enter a valid URL.'
            )
|
47 |
+
|
48 |
+
# BUG FIX: the original passed on_click=load_tab(), which *called* load_tab on
# every script rerun and registered its return value (None) as the callback.
# Pass the callable itself so it runs only when the button is clicked.
button_clicked = st.sidebar.button("Load tab", on_click=load_tab)

st.sidebar.divider()
|
51 |
+
|
52 |
+
|
53 |
+
from model import get_tidy_tab_bart  # cached accessor — see BUG FIX note below

# Warm all three models up front so the first prediction is fast.
with st.status("Loading models...", expanded=True, state="complete") as models:
    st.write("Loading https://huggingface.co/wgcv/tidy-tab-model-t5-small")
    get_tidy_tab_t5()
    st.write("Loaded T5-Small...")

    st.write("Loaded from https://huggingface.co/wgcv/tidy-tab-model-pegasus-xsum")
    get_tidy_tab_pegasus()
    st.write("Loaded Pegasus xsum...")

    st.write("Loaded from https://huggingface.co/wgcv/tidy-tab-model-bart-large-cnn")
    # BUG FIX: the original called load_model_bart(), which builds the pipeline
    # and throws it away (never stored in session_state), so the first BART
    # prediction loaded the model a second time. Use the cached accessor.
    get_tidy_tab_bart()
    st.write("Loaded Pegasus Bart-Large...")

    models.update(label="All models loaded!", state="complete", expanded=False)
|
67 |
+
|
68 |
+
|
69 |
+
|
loadhtml.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
def get_content(url):
    """Fetch *url* and return ``(text, title)``.

    ``text`` is a small prompt-style string built from the page's <title> and
    meta description; ``title`` is the cleaned page title. Commas are replaced
    with semicolons and newlines/tabs stripped — presumably to keep the output
    single-field friendly; TODO confirm against downstream consumers.
    """
    # Browser-like User-Agent: some sites refuse the default requests UA.
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'},
        timeout=10,  # BUG FIX: original had no timeout and could hang the app
    )

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the title (soup.title.string can be None for an empty <title>).
    title = (soup.title.string or '') if soup.title else ''

    # Extract meta description; .get avoids a KeyError on a content-less tag.
    meta_description = soup.find('meta', attrs={'name': 'description'})
    description = meta_description.get('content', '') if meta_description else ''

    # Extract headings and main paragraphs.
    headings = ' '.join(h.get_text() for h in soup.find_all(['h1', 'h2', 'h3']))
    paragraphs = ' '.join(p.get_text() for p in soup.find_all('p'))

    headings = headings.replace("\n", "").replace("\t", "").replace(",", ";")
    # BUG FIX: the original read `headings.replace(...)` here, overwriting the
    # paragraph text with a second copy of the headings.
    paragraphs = paragraphs.replace("\n", "").replace("\t", "").replace(",", ";")
    description = description.replace(",", ";")
    title = title.replace(",", ";")
    text = "[title] " + title + "\n [description]" + description
    # return {"url": url, "title":title, "description": description, "paragraphs": paragraphs, "headings":headings, "text": text , "summary": ""}
    return text, title
|
model.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##
|
2 |
+
from transformers import AutoTokenizer, pipeline
|
3 |
+
from transformers import T5ForConditionalGeneration
|
4 |
+
from transformers import PegasusForConditionalGeneration
|
5 |
+
from transformers import BartForConditionalGeneration
|
6 |
+
|
7 |
+
import streamlit as st
|
8 |
+
|
9 |
+
# T5
|
10 |
+
def get_tidy_tab_t5():
    """Return the session-cached T5 summarization pipeline, loading it on first use."""
    cached = st.session_state.get('tidy_tab_t5')
    if cached is None:
        cached = load_model_t5()
        st.session_state.tidy_tab_t5 = cached
    return cached
|
14 |
+
|
15 |
+
def load_model_t5():
    """Build a fresh summarization pipeline for wgcv/tidy-tab-model-t5-small."""
    name = "wgcv/tidy-tab-model-t5-small"
    return pipeline(
        'summarization',
        model=T5ForConditionalGeneration.from_pretrained(name),
        tokenizer=AutoTokenizer.from_pretrained(name),
    )
|
20 |
+
|
21 |
+
|
22 |
+
def predict_model_t5(text):
    """Summarize *text* with the T5 model; returns the summary string or None."""
    summarizer = get_tidy_tab_t5()
    if not summarizer:
        return None
    # The T5 checkpoint expects the task prefix on its input.
    outputs = summarizer("summarize: " + text, max_length=8, min_length=1)
    return outputs[0]['summary_text'] if len(outputs) > 0 else None
|
33 |
+
|
34 |
+
|
35 |
+
# pegasus-xsum
|
36 |
+
def get_tidy_tab_pegasus():
    """Return the session-cached Pegasus summarization pipeline, loading it on first use."""
    cached = st.session_state.get('tidy_tab_pegasus')
    if cached is None:
        cached = load_model_pegasus()
        st.session_state.tidy_tab_pegasus = cached
    return cached
|
40 |
+
|
41 |
+
def load_model_pegasus():
    """Build a fresh summarization pipeline for wgcv/tidy-tab-model-pegasus-xsum."""
    name = "wgcv/tidy-tab-model-pegasus-xsum"
    return pipeline(
        'summarization',
        model=PegasusForConditionalGeneration.from_pretrained(name),
        tokenizer=AutoTokenizer.from_pretrained(name),
    )
|
46 |
+
|
47 |
+
|
48 |
+
def predict_model_pegasus(text):
    """Summarize *text* with the Pegasus model; returns the summary string or None."""
    summarizer = get_tidy_tab_pegasus()
    if not summarizer:
        return None
    outputs = summarizer(text, max_length=8, min_length=1)
    return outputs[0]['summary_text'] if len(outputs) > 0 else None
|
59 |
+
|
60 |
+
|
61 |
+
# Bart-Large
|
62 |
+
def get_tidy_tab_bart():
    """Return the session-cached BART summarization pipeline, loading it on first use."""
    cached = st.session_state.get('tidy_tab_bart')
    if cached is None:
        cached = load_model_bart()
        st.session_state.tidy_tab_bart = cached
    return cached
|
66 |
+
|
67 |
+
def load_model_bart():
    """Build a fresh summarization pipeline for wgcv/tidy-tab-model-bart-large-cnn."""
    name = "wgcv/tidy-tab-model-bart-large-cnn"
    return pipeline(
        'summarization',
        model=BartForConditionalGeneration.from_pretrained(name),
        tokenizer=AutoTokenizer.from_pretrained(name),
    )
|
72 |
+
|
73 |
+
|
74 |
+
def predict_model_bart(text):
    """Summarize *text* with the BART model; returns the summary string or None."""
    summarizer = get_tidy_tab_bart()
    if not summarizer:
        return None
    # Sampling with beams, longer cap than the other models.
    outputs = summarizer(text, num_beams=4, max_length=12, min_length=1, do_sample=True)
    return outputs[0]['summary_text'] if len(outputs) > 0 else None
|
requirements.txt
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
altair==5.3.0
|
2 |
+
attrs==23.2.0
|
3 |
+
beautifulsoup4==4.12.3
|
4 |
+
blinker==1.8.2
|
5 |
+
bs4==0.0.2
|
6 |
+
cachetools==5.3.3
|
7 |
+
certifi==2024.7.4
|
8 |
+
charset-normalizer==3.3.2
|
9 |
+
click==8.1.7
|
10 |
+
filelock==3.15.4
|
11 |
+
fsspec==2024.6.1
|
12 |
+
gitdb==4.0.11
|
13 |
+
GitPython==3.1.43
|
14 |
+
huggingface-hub==0.23.4
|
15 |
+
idna==3.7
|
16 |
+
Jinja2==3.1.4
|
17 |
+
jsonschema==4.23.0
|
18 |
+
jsonschema-specifications==2023.12.1
|
19 |
+
markdown-it-py==3.0.0
|
20 |
+
MarkupSafe==2.1.5
|
21 |
+
mdurl==0.1.2
|
22 |
+
numpy==1.26.4
|
23 |
+
packaging==24.1
|
24 |
+
pandas==2.2.2
|
25 |
+
pillow==10.4.0
|
26 |
+
protobuf==5.27.2
|
27 |
+
pyarrow==16.1.0
|
28 |
+
pydeck==0.9.1
|
29 |
+
Pygments==2.18.0
|
30 |
+
python-dateutil==2.9.0.post0
|
31 |
+
pytz==2024.1
|
32 |
+
PyYAML==6.0.1
|
33 |
+
referencing==0.35.1
|
34 |
+
regex==2024.5.15
|
35 |
+
requests==2.32.3
|
36 |
+
rich==13.7.1
|
37 |
+
rpds-py==0.19.0
|
38 |
+
safetensors==0.4.3
|
39 |
+
six==1.16.0
|
40 |
+
smmap==5.0.1
|
41 |
+
soupsieve==2.5
|
42 |
+
streamlit==1.36.0
|
43 |
+
tenacity==8.5.0
|
44 |
+
tokenizers==0.19.1
|
45 |
+
toml==0.10.2
|
46 |
+
toolz==0.12.1
|
47 |
+
tornado==6.4.1
|
48 |
+
tqdm==4.66.4
|
49 |
+
transformers==4.42.3
|
50 |
+
typing_extensions==4.12.2
|
51 |
+
tzdata==2024.1
|
52 |
+
urllib3==2.2.2
|
53 |
+
validators==0.31.0
|
utils.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from urllib.parse import urlparse
|
2 |
+
import validators
|
3 |
+
|
4 |
+
def valid_url(url):
    """Normalize and validate *url*.

    Prepends ``https://`` when the scheme is missing (e.g. bare
    ``example.com``), then returns ``(is_valid, normalized_url)``.
    ``is_valid`` is the result of ``validators.url`` — truthy ``True`` on
    success, a falsy ``ValidationError`` otherwise (callers test it as a
    boolean).
    """
    try:
        parsed_url = urlparse(url)
        # Check if scheme is missing and prepend 'https://' if so
        if not parsed_url.scheme:
            url = 'https://' + url
        # Validate the modified or original URL
        return validators.url(url), url
    except Exception:
        # BUG FIX: narrowed the original bare `except:`, which also swallowed
        # SystemExit and KeyboardInterrupt.
        return False, url
|