wgcv committed on
Commit
b7a1a13
1 Parent(s): f65877b

first test

Browse files
Files changed (5) hide show
  1. app.py +69 -0
  2. loadhtml.py +33 -0
  3. model.py +84 -0
  4. requirements.txt +53 -0
  5. utils.py +13 -0
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import time
3
+ from utils import valid_url
4
+ from model import get_tidy_tab_t5, predict_model_t5
5
+ from model import get_tidy_tab_pegasus, predict_model_pegasus
6
+ from model import load_model_bart, predict_model_bart
7
+ from loadhtml import get_content
8
+
9
# --- Page chrome ---------------------------------------------------------
st.title("Tab Recall Simplified 🚀")
st.markdown("Condense your Browser Tabs into a few impactful words. - Inspired in Arc Max Browser")

# --- Sidebar inputs ------------------------------------------------------
st.sidebar.caption("Tidy Tabs - Title")
user_input_url = st.sidebar.text_input('Enter your url:')

# Holds the "invalid URL" error widget between reruns; load_tab() reads it
# to decide whether a previous error message should be cleared.
error_message_url = None
23
+
24
def load_tab():
    """Validate the sidebar URL, fetch the page, and render one summary per model.

    Invoked as the "Load tab" button callback. Reads `user_input_url` and
    `error_message_url` from module scope; writes results to the sidebar.
    """
    # BUG FIX: the original stored the error widget in a *local* name
    # (`error_message`), so the clearing check on `error_message_url` below
    # could never fire. Bind the module-level name instead.
    global error_message_url
    if not user_input_url:
        return
    # Clear any stale validation error from a previous run.
    if error_message_url:
        error_message_url.empty()
        error_message_url = None
    is_url_valid, url = valid_url(user_input_url)
    if not is_url_valid:
        # BUG FIX: the original formatted `text` here, which is undefined in
        # this branch (NameError); report the user's input instead.
        error_message_url = st.sidebar.error(
            f'{user_input_url} is not a valid URL. Please enter a valid URL.'
        )
        return
    text, title = get_content(url)
    if text == "":
        # NOTE(review): extraction failure is only logged to stdout; consider
        # surfacing it in the sidebar like the URL error — confirm intent.
        print("error")
        return
    with st.spinner('Wait for it...'):
        st.sidebar.write(f'<title>: **{title}**')
        time.sleep(1)
    with st.spinner('Wait for it...'):
        st.sidebar.write(f'T5-small: **{predict_model_t5(text)}**')
    with st.spinner('Wait for it...'):
        st.sidebar.write(f'Pegasus xsum: **{predict_model_pegasus(text)}**')
    with st.spinner('Wait for it...'):
        st.sidebar.write(f'Pegasus Bart: **{predict_model_bart(text)}**')
47
+
48
# BUG FIX: the original passed on_click=load_tab(), which *called* the
# function during script execution and registered its return value (None)
# as the callback. Pass the function object itself.
button_clicked = st.sidebar.button("Load tab", on_click=load_tab)

st.sidebar.divider()


# Warm up the three summarization models at startup so the first button
# click does not pay the full download/initialization cost.
# FIX: start the status box as "running" — it previously claimed
# state="complete" before anything had loaded.
with st.status("Loading models...", expanded=True, state="running") as models:
    st.write("Loading https://huggingface.co/wgcv/tidy-tab-model-t5-small")
    get_tidy_tab_t5()
    st.write("Loaded T5-Small...")

    st.write("Loaded from https://huggingface.co/wgcv/tidy-tab-model-pegasus-xsum")
    get_tidy_tab_pegasus()
    st.write("Loaded Pegasus xsum...")

    st.write("Loaded from https://huggingface.co/wgcv/tidy-tab-model-bart-large-cnn")
    # NOTE(review): this calls load_model_bart() directly, bypassing the
    # session-state cache used by get_tidy_tab_bart(), so the BART model is
    # likely loaded twice (here and on first predict). Importing and calling
    # get_tidy_tab_bart() would fix it — confirm and adjust the import list.
    load_model_bart()
    st.write("Loaded Pegasus Bart-Large...")

    models.update(label="All models loaded!", state="complete", expanded=False)
67
+
68
+
69
+
loadhtml.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
def get_content(url):
    """Download *url* and extract text suitable for summarization.

    Returns a tuple ``(text, title)`` where ``text`` is
    ``"[title] <title>\\n [description]<description>"`` and ``title`` is the
    page title with commas replaced. Empty strings are used for any field
    that cannot be extracted; the caller treats ``text == ""`` as failure.
    """
    # Spoof a desktop-browser User-Agent: some sites serve stripped pages
    # to unknown clients.
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'},
        timeout=15,  # FIX: without a timeout a dead host hangs the app forever
    )

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the title. FIX: Tag.string is None when <title> has multiple
    # children, which crashed the later .replace() call — coerce to ''.
    title = (soup.title.string or '') if soup.title else ''

    # Extract meta description. FIX: use .get() so a <meta name="description">
    # tag without a content= attribute cannot raise KeyError.
    meta_description = soup.find('meta', attrs={'name': 'description'})
    description = meta_description.get('content', '') if meta_description else ''

    # Extract headings and main paragraphs (collected and cleaned, but not
    # currently included in the returned `text`).
    headings = ' '.join(h.get_text() for h in soup.find_all(['h1', 'h2', 'h3']))
    paragraphs = ' '.join(p.get_text() for p in soup.find_all('p'))
    headings = headings.replace("\n", "").replace("\t", "").replace(",", ";")
    # BUG FIX: the original assigned `paragraphs = headings.replace(...)`,
    # overwriting the paragraph text with the cleaned headings.
    paragraphs = paragraphs.replace("\n", "").replace("\t", "").replace(",", ";")
    # Commas are swapped for semicolons — presumably for downstream
    # CSV-style handling; TODO confirm.
    description = description.replace(",", ";")
    title = title.replace(",", ";")
    text = "[title] " + title + "\n [description]" + description
    # return {"url": url, "title":title, "description": description, "paragraphs": paragraphs, "headings":headings, "text": text , "summary": ""}
    return text, title
model.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##
2
+ from transformers import AutoTokenizer, pipeline
3
+ from transformers import T5ForConditionalGeneration
4
+ from transformers import PegasusForConditionalGeneration
5
+ from transformers import BartForConditionalGeneration
6
+
7
+ import streamlit as st
8
+
9
+ # T5
10
def get_tidy_tab_t5():
    """Return the T5 summarization pipeline, loading it on first access.

    The pipeline is memoized in Streamlit's session state under the key
    'tidy_tab_t5' so reruns of the script reuse the loaded model.
    """
    key = 'tidy_tab_t5'
    if key not in st.session_state:
        st.session_state[key] = load_model_t5()
    return st.session_state[key]
14
+
15
def load_model_t5():
    """Build a summarization pipeline from the fine-tuned T5-small checkpoint."""
    checkpoint = "wgcv/tidy-tab-model-t5-small"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    t5 = T5ForConditionalGeneration.from_pretrained(checkpoint)
    return pipeline('summarization', model=t5, tokenizer=tok)
20
+
21
+
22
def predict_model_t5(text):
    """Summarize *text* with the T5 pipeline.

    Prefixes the T5 task marker "summarize: " before inference. Returns the
    summary string, or None when the pipeline is unavailable or yields no
    output.
    """
    summarizer = get_tidy_tab_t5()
    if not summarizer:
        return None
    outputs = summarizer("summarize: " + text, max_length=8, min_length=1)
    return outputs[0]['summary_text'] if outputs else None
33
+
34
+
35
+ # pegasus-xsum
36
def get_tidy_tab_pegasus():
    """Return the Pegasus summarization pipeline, loading it on first access.

    Memoized in Streamlit's session state under 'tidy_tab_pegasus'.
    """
    key = 'tidy_tab_pegasus'
    if key not in st.session_state:
        st.session_state[key] = load_model_pegasus()
    return st.session_state[key]
40
+
41
def load_model_pegasus():
    """Build a summarization pipeline from the fine-tuned Pegasus-xsum checkpoint."""
    checkpoint = "wgcv/tidy-tab-model-pegasus-xsum"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    pegasus = PegasusForConditionalGeneration.from_pretrained(checkpoint)
    return pipeline('summarization', model=pegasus, tokenizer=tok)
46
+
47
+
48
def predict_model_pegasus(text):
    """Summarize *text* with the Pegasus pipeline.

    Returns the summary string, or None when the pipeline is unavailable or
    yields no output.
    """
    summarizer = get_tidy_tab_pegasus()
    if not summarizer:
        return None
    outputs = summarizer(text, max_length=8, min_length=1)
    return outputs[0]['summary_text'] if outputs else None
59
+
60
+
61
+ # Bart-Large
62
def get_tidy_tab_bart():
    """Return the BART summarization pipeline, loading it on first access.

    Memoized in Streamlit's session state under 'tidy_tab_bart'.
    """
    key = 'tidy_tab_bart'
    if key not in st.session_state:
        st.session_state[key] = load_model_bart()
    return st.session_state[key]
66
+
67
def load_model_bart():
    """Build a summarization pipeline from the fine-tuned BART-large-cnn checkpoint."""
    checkpoint = "wgcv/tidy-tab-model-bart-large-cnn"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    bart = BartForConditionalGeneration.from_pretrained(checkpoint)
    return pipeline('summarization', model=bart, tokenizer=tok)
72
+
73
+
74
def predict_model_bart(text):
    """Summarize *text* with the BART pipeline.

    Returns the summary string, or None when the pipeline is unavailable or
    yields no output.
    NOTE(review): do_sample=True makes output non-deterministic across calls
    (even alongside num_beams=4) — confirm that is intended.
    """
    summarizer = get_tidy_tab_bart()
    if not summarizer:
        return None
    outputs = summarizer(text, num_beams=4, max_length=12, min_length=1, do_sample=True)
    return outputs[0]['summary_text'] if outputs else None
requirements.txt ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.3.0
2
+ attrs==23.2.0
3
+ beautifulsoup4==4.12.3
4
+ blinker==1.8.2
5
+ bs4==0.0.2
6
+ cachetools==5.3.3
7
+ certifi==2024.7.4
8
+ charset-normalizer==3.3.2
9
+ click==8.1.7
10
+ filelock==3.15.4
11
+ fsspec==2024.6.1
12
+ gitdb==4.0.11
13
+ GitPython==3.1.43
14
+ huggingface-hub==0.23.4
15
+ idna==3.7
16
+ Jinja2==3.1.4
17
+ jsonschema==4.23.0
18
+ jsonschema-specifications==2023.12.1
19
+ markdown-it-py==3.0.0
20
+ MarkupSafe==2.1.5
21
+ mdurl==0.1.2
22
+ numpy==1.26.4
23
+ packaging==24.1
24
+ pandas==2.2.2
25
+ pillow==10.4.0
26
+ protobuf==5.27.2
27
+ pyarrow==16.1.0
28
+ pydeck==0.9.1
29
+ Pygments==2.18.0
30
+ python-dateutil==2.9.0.post0
31
+ pytz==2024.1
32
+ PyYAML==6.0.1
33
+ referencing==0.35.1
34
+ regex==2024.5.15
35
+ requests==2.32.3
36
+ rich==13.7.1
37
+ rpds-py==0.19.0
38
+ safetensors==0.4.3
39
+ six==1.16.0
40
+ smmap==5.0.1
41
+ soupsieve==2.5
42
+ streamlit==1.36.0
43
+ tenacity==8.5.0
44
+ tokenizers==0.19.1
45
+ toml==0.10.2
46
+ toolz==0.12.1
47
+ tornado==6.4.1
48
+ tqdm==4.66.4
49
+ transformers==4.42.3
50
+ typing_extensions==4.12.2
51
+ tzdata==2024.1
52
+ urllib3==2.2.2
53
+ validators==0.31.0
utils.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urllib.parse import urlparse
2
+ import validators
3
+
4
def valid_url(url):
    """Normalize and validate a URL string.

    Prepends 'https://' when the input has no scheme, then validates with
    the `validators` package.

    Returns:
        (is_valid, url): `is_valid` is truthy when the (possibly modified)
        URL is valid; `url` is the normalized URL (or the original input on
        failure).
    """
    try:
        parsed_url = urlparse(url)
        # Check if scheme is missing and prepend 'https://' if so
        if not parsed_url.scheme:
            url = 'https://' + url
        # Validate the modified or original URL
        return validators.url(url), url
    # FIX: narrowed from a bare `except:`, which also swallowed
    # SystemExit/KeyboardInterrupt.
    except Exception:
        return False, url