wgcv committed on
Commit
b7a1a13
1 Parent(s): f65877b

first test

Browse files
Files changed (5) hide show
  1. app.py +69 -0
  2. loadhtml.py +33 -0
  3. model.py +84 -0
  4. requirements.txt +53 -0
  5. utils.py +13 -0
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import time
3
+ from utils import valid_url
4
+ from model import get_tidy_tab_t5, predict_model_t5
5
+ from model import get_tidy_tab_pegasus, predict_model_pegasus
6
+ from model import load_model_bart, predict_model_bart
7
+ from loadhtml import get_content
8
+
9
# --- Page chrome ---------------------------------------------------------
st.title("Tab Recall Simplified 🚀")
st.markdown("Condense your Browser Tabs into a few impactful words. - Inspired in Arc Max Browser")

# --- Sidebar inputs ------------------------------------------------------
st.sidebar.caption("Tidy Tabs - Title")
user_input_url = st.sidebar.text_input('Enter your url:')

# Holds the "invalid URL" error widget between reruns; load_tab() reads it
# to decide whether a previous error message should be cleared.
error_message_url = None
23
+
24
def load_tab():
    """Validate the sidebar URL, fetch the page, and render one summary per model.

    Invoked as the "Load tab" button callback. Reads `user_input_url` and
    `error_message_url` from module scope; writes results to the sidebar.
    """
    # BUG FIX: the original stored the error widget in a *local* name
    # (`error_message`), so the clearing check on `error_message_url` below
    # could never fire. Bind the module-level name instead.
    global error_message_url
    if not user_input_url:
        return
    # Clear any stale validation error from a previous run.
    if error_message_url:
        error_message_url.empty()
        error_message_url = None
    is_url_valid, url = valid_url(user_input_url)
    if not is_url_valid:
        # BUG FIX: the original formatted `text` here, which is undefined in
        # this branch (NameError); report the user's input instead.
        error_message_url = st.sidebar.error(
            f'{user_input_url} is not a valid URL. Please enter a valid URL.'
        )
        return
    text, title = get_content(url)
    if text == "":
        # NOTE(review): extraction failure is only logged to stdout; consider
        # surfacing it in the sidebar like the URL error — confirm intent.
        print("error")
        return
    with st.spinner('Wait for it...'):
        st.sidebar.write(f'<title>: **{title}**')
        time.sleep(1)
    with st.spinner('Wait for it...'):
        st.sidebar.write(f'T5-small: **{predict_model_t5(text)}**')
    with st.spinner('Wait for it...'):
        st.sidebar.write(f'Pegasus xsum: **{predict_model_pegasus(text)}**')
    with st.spinner('Wait for it...'):
        st.sidebar.write(f'Pegasus Bart: **{predict_model_bart(text)}**')
47
+
48
# BUG FIX: the original passed on_click=load_tab(), which *called* the
# function during script execution and registered its return value (None)
# as the callback. Pass the function object itself.
button_clicked = st.sidebar.button("Load tab", on_click=load_tab)

st.sidebar.divider()


# Warm up the three summarization models at startup so the first button
# click does not pay the full download/initialization cost.
# FIX: start the status box as "running" — it previously claimed
# state="complete" before anything had loaded.
with st.status("Loading models...", expanded=True, state="running") as models:
    st.write("Loading https://huggingface.co/wgcv/tidy-tab-model-t5-small")
    get_tidy_tab_t5()
    st.write("Loaded T5-Small...")

    st.write("Loaded from https://huggingface.co/wgcv/tidy-tab-model-pegasus-xsum")
    get_tidy_tab_pegasus()
    st.write("Loaded Pegasus xsum...")

    st.write("Loaded from https://huggingface.co/wgcv/tidy-tab-model-bart-large-cnn")
    # NOTE(review): this calls load_model_bart() directly, bypassing the
    # session-state cache used by get_tidy_tab_bart(), so the BART model is
    # likely loaded twice (here and on first predict). Importing and calling
    # get_tidy_tab_bart() would fix it — confirm and adjust the import list.
    load_model_bart()
    st.write("Loaded Pegasus Bart-Large...")

    models.update(label="All models loaded!", state="complete", expanded=False)
67
+
68
+
69
+
loadhtml.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
def get_content(url):
    """Download *url* and extract text suitable for summarization.

    Returns a tuple ``(text, title)`` where ``text`` is
    ``"[title] <title>\\n [description]<description>"`` and ``title`` is the
    page title with commas replaced. Empty strings are used for any field
    that cannot be extracted; the caller treats ``text == ""`` as failure.
    """
    # Spoof a desktop-browser User-Agent: some sites serve stripped pages
    # to unknown clients.
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'},
        timeout=15,  # FIX: without a timeout a dead host hangs the app forever
    )

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the title. FIX: Tag.string is None when <title> has multiple
    # children, which crashed the later .replace() call — coerce to ''.
    title = (soup.title.string or '') if soup.title else ''

    # Extract meta description. FIX: use .get() so a <meta name="description">
    # tag without a content= attribute cannot raise KeyError.
    meta_description = soup.find('meta', attrs={'name': 'description'})
    description = meta_description.get('content', '') if meta_description else ''

    # Extract headings and main paragraphs (collected and cleaned, but not
    # currently included in the returned `text`).
    headings = ' '.join(h.get_text() for h in soup.find_all(['h1', 'h2', 'h3']))
    paragraphs = ' '.join(p.get_text() for p in soup.find_all('p'))
    headings = headings.replace("\n", "").replace("\t", "").replace(",", ";")
    # BUG FIX: the original assigned `paragraphs = headings.replace(...)`,
    # overwriting the paragraph text with the cleaned headings.
    paragraphs = paragraphs.replace("\n", "").replace("\t", "").replace(",", ";")
    # Commas are swapped for semicolons — presumably for downstream
    # CSV-style handling; TODO confirm.
    description = description.replace(",", ";")
    title = title.replace(",", ";")
    text = "[title] " + title + "\n [description]" + description
    # return {"url": url, "title":title, "description": description, "paragraphs": paragraphs, "headings":headings, "text": text , "summary": ""}
    return text, title
model.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##
2
+ from transformers import AutoTokenizer, pipeline
3
+ from transformers import T5ForConditionalGeneration
4
+ from transformers import PegasusForConditionalGeneration
5
+ from transformers import BartForConditionalGeneration
6
+
7
+ import streamlit as st
8
+
9
+ # T5
10
def get_tidy_tab_t5():
    """Return the T5 summarization pipeline, loading it on first access.

    The pipeline is memoized in Streamlit's session state under the key
    'tidy_tab_t5' so reruns of the script reuse the loaded model.
    """
    key = 'tidy_tab_t5'
    if key not in st.session_state:
        st.session_state[key] = load_model_t5()
    return st.session_state[key]
14
+
15
def load_model_t5():
    """Build a summarization pipeline from the fine-tuned T5-small checkpoint."""
    checkpoint = "wgcv/tidy-tab-model-t5-small"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    t5 = T5ForConditionalGeneration.from_pretrained(checkpoint)
    return pipeline('summarization', model=t5, tokenizer=tok)
20
+
21
+
22
def predict_model_t5(text):
    """Summarize *text* with the T5 pipeline.

    Prefixes the T5 task marker "summarize: " before inference. Returns the
    summary string, or None when the pipeline is unavailable or yields no
    output.
    """
    summarizer = get_tidy_tab_t5()
    if not summarizer:
        return None
    outputs = summarizer("summarize: " + text, max_length=8, min_length=1)
    return outputs[0]['summary_text'] if outputs else None
33
+
34
+
35
+ # pegasus-xsum
36
def get_tidy_tab_pegasus():
    """Return the Pegasus summarization pipeline, loading it on first access.

    Memoized in Streamlit's session state under 'tidy_tab_pegasus'.
    """
    key = 'tidy_tab_pegasus'
    if key not in st.session_state:
        st.session_state[key] = load_model_pegasus()
    return st.session_state[key]
40
+
41
def load_model_pegasus():
    """Build a summarization pipeline from the fine-tuned Pegasus-xsum checkpoint."""
    checkpoint = "wgcv/tidy-tab-model-pegasus-xsum"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    pegasus = PegasusForConditionalGeneration.from_pretrained(checkpoint)
    return pipeline('summarization', model=pegasus, tokenizer=tok)
46
+
47
+
48
def predict_model_pegasus(text):
    """Summarize *text* with the Pegasus pipeline.

    Returns the summary string, or None when the pipeline is unavailable or
    yields no output.
    """
    summarizer = get_tidy_tab_pegasus()
    if not summarizer:
        return None
    outputs = summarizer(text, max_length=8, min_length=1)
    return outputs[0]['summary_text'] if outputs else None
59
+
60
+
61
+ # Bart-Large
62
def get_tidy_tab_bart():
    """Return the BART summarization pipeline, loading it on first access.

    Memoized in Streamlit's session state under 'tidy_tab_bart'.
    """
    key = 'tidy_tab_bart'
    if key not in st.session_state:
        st.session_state[key] = load_model_bart()
    return st.session_state[key]
66
+
67
def load_model_bart():
    """Build a summarization pipeline from the fine-tuned BART-large-cnn checkpoint."""
    checkpoint = "wgcv/tidy-tab-model-bart-large-cnn"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    bart = BartForConditionalGeneration.from_pretrained(checkpoint)
    return pipeline('summarization', model=bart, tokenizer=tok)
72
+
73
+
74
def predict_model_bart(text):
    """Summarize *text* with the BART pipeline.

    Returns the summary string, or None when the pipeline is unavailable or
    yields no output.
    NOTE(review): do_sample=True makes output non-deterministic across calls
    (even alongside num_beams=4) — confirm that is intended.
    """
    summarizer = get_tidy_tab_bart()
    if not summarizer:
        return None
    outputs = summarizer(text, num_beams=4, max_length=12, min_length=1, do_sample=True)
    return outputs[0]['summary_text'] if outputs else None
requirements.txt ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.3.0
2
+ attrs==23.2.0
3
+ beautifulsoup4==4.12.3
4
+ blinker==1.8.2
5
+ bs4==0.0.2
6
+ cachetools==5.3.3
7
+ certifi==2024.7.4
8
+ charset-normalizer==3.3.2
9
+ click==8.1.7
10
+ filelock==3.15.4
11
+ fsspec==2024.6.1
12
+ gitdb==4.0.11
13
+ GitPython==3.1.43
14
+ huggingface-hub==0.23.4
15
+ idna==3.7
16
+ Jinja2==3.1.4
17
+ jsonschema==4.23.0
18
+ jsonschema-specifications==2023.12.1
19
+ markdown-it-py==3.0.0
20
+ MarkupSafe==2.1.5
21
+ mdurl==0.1.2
22
+ numpy==1.26.4
23
+ packaging==24.1
24
+ pandas==2.2.2
25
+ pillow==10.4.0
26
+ protobuf==5.27.2
27
+ pyarrow==16.1.0
28
+ pydeck==0.9.1
29
+ Pygments==2.18.0
30
+ python-dateutil==2.9.0.post0
31
+ pytz==2024.1
32
+ PyYAML==6.0.1
33
+ referencing==0.35.1
34
+ regex==2024.5.15
35
+ requests==2.32.3
36
+ rich==13.7.1
37
+ rpds-py==0.19.0
38
+ safetensors==0.4.3
39
+ six==1.16.0
40
+ smmap==5.0.1
41
+ soupsieve==2.5
42
+ streamlit==1.36.0
43
+ tenacity==8.5.0
44
+ tokenizers==0.19.1
45
+ toml==0.10.2
46
+ toolz==0.12.1
47
+ tornado==6.4.1
48
+ tqdm==4.66.4
49
+ transformers==4.42.3
50
+ typing_extensions==4.12.2
51
+ tzdata==2024.1
52
+ urllib3==2.2.2
53
+ validators==0.31.0
utils.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urllib.parse import urlparse
2
+ import validators
3
+
4
def valid_url(url):
    """Normalize and validate a URL string.

    Prepends 'https://' when the input has no scheme, then validates with
    the `validators` package.

    Returns:
        (is_valid, url): `is_valid` is truthy when the (possibly modified)
        URL is valid; `url` is the normalized URL (or the original input on
        failure).
    """
    try:
        parsed_url = urlparse(url)
        # Check if scheme is missing and prepend 'https://' if so
        if not parsed_url.scheme:
            url = 'https://' + url
        # Validate the modified or original URL
        return validators.url(url), url
    # FIX: narrowed from a bare `except:`, which also swallowed
    # SystemExit/KeyboardInterrupt.
    except Exception:
        return False, url