import streamlit as st import pdfplumber, re from transformers import pipeline, AutoTokenizer st.set_page_config(page_title="Financial News Analyzer", page_icon="📰", layout="wide") @st.cache_resource(ttl=86400) # ───────────────── Cached pipelines ──────────────────────────────────── def load_pipes(): summarizer = pipeline("summarization", model=SUMM_MODEL) tokenizer = AutoTokenizer.from_pretrained(SUMM_MODEL) sentiment = pipeline("text-classification", model=SENT_MODEL) ner = pipeline("token-classification", model=NER_MODEL, aggregation_strategy="simple") return summarizer, tokenizer, sentiment, ner # ───────────────── Helper functions ──────────────────────────────────── # Read text from PDF def extract_pdf(file): text = "" with pdfplumber.open(file) as pdf: for page in pdf.pages: text += page.extract_text() or "" return text # Split text to avoid long contents def split_by_tokens(text, max_tokens): words = re.split(r"(\s+)", text) buf, n = "", 0 for w in words: ln = len(TOK(w).input_ids) if n + ln <= max_tokens: buf, n = buf + w, n + ln else: yield buf.strip() buf, n = w, ln if buf.strip(): yield buf.strip() # Summarise the news def summarise(text): parts = list(split_by_tokens(text, MAX_TOK)) per_len = max(25, min(80, TARGET_WORDS // max(1, len(parts)))) first_pass = [ SUMMAR(p, max_length=per_len, min_length=per_len // 2, do_sample=False)[0]["summary_text"] for p in parts ] joined = " ".join(first_pass) if len(joined.split()) > TARGET_WORDS: joined = SUMMAR(joined, max_length=TARGET_WORDS, min_length=TARGET_WORDS // 2, do_sample=False)[0]["summary_text"] return joined # Shorten the summary in 1-5 sentence (let user to choose) def shorten(summary, n_sentences): s = summary.split(". ") return (". ".join(s[:n_sentences]).rstrip(".") + ".") if len(s) > n_sentences else summary # Key entity tagging def tag_entities(text): tag_dict = {"Organization": [], "Person": [], "Location": [], "Miscellaneous": []} for entity in NER(text): group = {"ORG": "Organization", "PER": "Person", "LOC": "Location"}.get(entity["entity_group"], "Miscellaneous") tag_dict[group].append(entity["word"]) return {k: sorted(set(v)) for k, v in tag_dict.items() if v} # ───────────────── Main App Logic ──────────────────────────────────── def main(): st.title("📰 Financial News Analyzer") st.markdown("##### Instantly grasp news content, sentiment, and relevant entities") # Sidebar Input with st.sidebar: st.header("Input News to Analyze:") # Step 1: enter the news txt_input = st.text_area("Paste news article", height=150) pdf_file = st.file_uploader("Or upload PDF", type=["pdf"]) # let user choose summary length (1-5 sentence) sent_count = st.slider("Summary length (sentences)", min_value=1, max_value=5, value=3) run_btn = st.button("🔍 Analyze", use_container_width=True) raw_text = extract_pdf(pdf_file) if pdf_file else txt_input.strip() # Main processing if run_btn: if not raw_text: st.warning("Please provide text or a PDF first.") st.stop() with st.spinner("Analyzing..."): full_sum = summarise(raw_text) summary = shorten(full_sum, sent_count) # Step 2: Summarization cols = st.columns([2, 1]) with cols[0]: st.subheader("📝 Summary") st.markdown(f"
{summary}
", unsafe_allow_html=True) # Step 3: Sentiment analysis with cols[1]: result = SENT_CLF(summary)[0] label = LABEL_MAP.get(result["label"], result["label"]) color = COLOR_MAP[label] st.subheader("📊 Sentiment") st.markdown( f"

{label}

" f"{result['score'] * 100:.1f}% Confidence

", unsafe_allow_html=True ) # Step 3: Entity Tags tags = tag_entities(summary) st.subheader("🏷️ Relevant Tags") if tags: # Tag pill CSS pill_css = """ """ st.markdown(pill_css, unsafe_allow_html=True) for category, values in tags.items(): st.markdown(f"
{category}
", unsafe_allow_html=True) pills = "".join(f"{v}" for v in values) st.markdown(pills, unsafe_allow_html=True) else: st.info("No entities detected.") # ───────────────── Main Part ─────────────────────────────────────── # models and other constant variables SUMM_MODEL = "sshleifer/distilbart-cnn-12-6" SENT_MODEL = "nynn/Fintuned_Sentiment" NER_MODEL = "Babelscape/wikineural-multilingual-ner" SUMMAR, TOK, SENT_CLF, NER = load_pipes() MAX_TOK = 1024 TARGET_WORDS = 225 LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive", "LABEL_2": "Neutral"} COLOR_MAP = {"Positive": "green", "Negative": "red", "Neutral": "gray"} if __name__ == "__main__": main()