import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from bs4 import BeautifulSoup
import requests

# app layout #
st.set_page_config(page_title="Medium News App")

## FUNCTIONS ##

# search medium urls function #
@st.experimental_singleton
def search_meduim_urls(monitored_tickers):
    search_url = "https://medium.com/tag/{}".format(monitored_tickers)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # links to articles live in <a> tags with the attribute aria-label="Post Preview Title" #
    atags = soup.find_all('a', attrs={"aria-label": "Post Preview Title"})
    hrefs = ['https://medium.com' + link['href'] for link in atags]
    return hrefs

# function to search and scrape the cleaned urls #
@st.experimental_singleton
def scrape_and_process(URLs):
    """
    - Grab all p-tags from each article page.
    - Collect the text of every p-tag into a list.
    - Split the joined text into individual words, keeping at most 350.
    - Re-join the words into one corpus per article.
    - Each article is capped at 350 words because the model used here has a
      512-token maximum input, and a shorter input keeps the app faster.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

# function to summarise all articles #
@st.experimental_singleton
def summarize(articles, _tokenizer, _model):
    """
    Encode each article, generate a summary, decode it, and append it to a list.
    The leading underscore on _tokenizer and _model tells Streamlit's cache
    not to attempt to hash these arguments.
    """
    summaries = []
    for article in articles:
        input_ids = _tokenizer.encode(article, return_tensors='pt', max_length=512, truncation=True)
        output = _model.generate(input_ids, max_length=56, num_beams=5, early_stopping=True)
        summary = _tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

# function to load the transformer #
@st.experimental_singleton
def load_summary_transformer():
    # load the summarisation model and its tokenizer #
    model_name = "facebook/bart-large-cnn"
    tokenizer_summary = AutoTokenizer.from_pretrained(model_name)
    model_summary = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer_summary, model_summary

# function to load the sentiment pipeline #
@st.experimental_singleton
def load_sentiment_pipeline():
    sentiment = pipeline('sentiment-analysis')
    return sentiment

# function to create the final output #
def create_output_array(summaries, scores, urls):
    # relies on the global monitored_tickers defined in the app body #
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

# display summary output #
def cards(title, score, sentiment, article, link):
    return f"""