Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
| from nltk.tokenize import word_tokenize | |
| from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
| import io | |
| from collections import Counter | |
| import string | |
| import os | |
| from nltk.stem import PorterStemmer | |
# Download the NLTK resources this app relies on.
# Every resource is fetched into one explicit directory that is also registered
# on NLTK's search path, so lookups (e.g. stopwords) do not depend on NLTK's
# default download location being writable. Each resource is requested once.
nltk_data_path = "/home/user/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)  # no check-then-create race
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)
for _resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(_resource, download_dir=nltk_data_path)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
# Load models (cached so they are not rebuilt on every Streamlit rerun)
@st.cache_resource
def load_classification_model():
    """Return the text-classification pipeline for the fine-tuned news model.

    The tokenizer and model are loaded from the Hugging Face Hub identifier
    below. The result is cached with ``st.cache_resource`` so that repeated
    calls reuse one pipeline object instead of reloading the weights.

    Returns:
        A ``transformers`` text-classification pipeline.
    """
    model_name = "Imasha17/News_classification.4"  # Replace with your model path
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)
@st.cache_resource
def load_qa_model():
    """Return the extractive question-answering pipeline.

    Cached with ``st.cache_resource`` because callers invoke this on every
    button click; without the cache the model would be reloaded each time.

    Returns:
        A ``transformers`` question-answering pipeline backed by
        ``deepset/roberta-base-squad2``.
    """
    return pipeline("question-answering", model="deepset/roberta-base-squad2")
# Function to generate word cloud
def generate_wordcloud(text, title=None):
    """Render a word cloud of *text* in the Streamlit app.

    Args:
        text: The text whose word frequencies are drawn.
        title: Optional heading drawn above the cloud.

    The cloud is drawn on its own figure, which is closed after rendering so
    that figures do not accumulate across Streamlit reruns.
    """
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when the text yields no words to draw.
        st.info("Not enough text to build a word cloud.")
        return

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    if title:
        ax.set_title(title, fontsize=20)
    st.pyplot(fig)
    plt.close(fig)
# Set page config with an attractive icon and layout options
st.set_page_config(
    page_title="News Analysis Dashboard",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS to improve styling.
# The .card rule uses the valid CSS property `color`; the misspelled `colour`
# is not a CSS property and is ignored by browsers.
st.markdown("""
<style>
.reportview-container {
background: #f0f2f6;
}
/* Header styling */
.header {
background: linear-gradient(90deg, #1a73e8, #4285f4);
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
text-align: center;
color: white;
}
.header h1 {
font-size: 48px;
margin: 0;
font-weight: bold;
}
/* Sidebar styling */
.css-1d391kg {
background-color: #ffffff;
}
/* Button styling */
.stButton>button {
background-color: #1a73e8;
color: white;
border: none;
padding: 10px 20px;
border-radius: 5px;
font-size: 16px;
}
.stButton>button:hover {
background-color: #0c55b3;
}
/* Text input styling */
.stTextInput>div>div>input {
background-color: #ffffff;
color: #333333;
font-size: 16px;
}
/* Card style containers */
.card {
background-color: #ffffff;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
box-shadow: 0px 4px 8px rgba(0,0,0,0.05);
color:#1a73e8;
}
</style>
""", unsafe_allow_html=True)
# Static HTML fragments for the top of the page: the gradient banner and the
# welcome card that lists what the dashboard can do.
BANNER_HTML = """
<div class="header">
<h1>News Content Analyzer</h1>
<p style="font-size: 20px; margin-top: 5px;">Analyze, classify, and explore news content with AI</p>
</div>
"""

INTRO_HTML = """
<div class="card">
<h2 style="color:#1a73e8;">Welcome!</h2>
<p style="color:#1a73e8;">This dashboard allows you to:
<ul style="color:#1a73e8;">
<li>Classify news articles into categories</li>
<li>Ask questions about the news content</li>
<li>Visualize sentiment, entities, and summaries</li>
</ul>
Use the tabs below to navigate between different functionalities.
</p>
</div>
"""

# Render the banner first, then the introduction card.
for _html_fragment in (BANNER_HTML, INTRO_HTML):
    st.markdown(_html_fragment, unsafe_allow_html=True)

# One tab per feature area; the tab objects are used as context managers below.
_tab_titles = ["News Classification", "Ask Questions", "Advanced Features"]
tab1, tab2, tab3 = st.tabs(_tab_titles)
# Pre-compiled patterns / tables shared by the text-cleaning helpers below.
_URL_PATTERN = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
_EMAIL_PATTERN = re.compile(r'\S+@\S+')
_NON_LETTER_PATTERN = re.compile(r'[^A-Za-z\s]')
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _clean_text(text, stop_words):
    """Lower-case *text* and strip URLs, e-mails, punctuation, stopwords and non-letters."""
    text = _URL_PATTERN.sub("", text.lower()).strip()
    text = _EMAIL_PATTERN.sub("", text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = " ".join(word for word in text.split() if word not in stop_words)
    return _NON_LETTER_PATTERN.sub("", text)


def _preprocess_articles(contents, stop_words, n_common=10, n_rare=20):
    """Return one cleaned, stemmed string per entry of the *contents* Series.

    Steps, in order: per-article cleaning, removal of the ``n_common`` most
    frequent and ``n_rare`` least frequent words of the whole corpus, then
    Porter stemming. Missing values are treated as empty articles so a blank
    CSV cell cannot crash the string helpers.
    """
    cleaned = [_clean_text(text, stop_words) for text in contents.fillna("").astype(str)]

    # Corpus-wide frequencies, counted before any frequency-based removal.
    word_count = Counter(word for text in cleaned for word in text.split())
    ranked = word_count.most_common()
    dropped = {word for word, _ in ranked[:n_common]}
    dropped |= {word for word, _ in ranked[-n_rare:]}

    stemmer = PorterStemmer()
    return [
        " ".join(stemmer.stem(word) for word in text.split() if word not in dropped)
        for text in cleaned
    ]


def _show_wordcloud(text_data):
    """Draw a word cloud for an iterable of strings, or a notice if there are no words."""
    try:
        cloud = WordCloud(width=800, height=400).generate(" ".join(text_data))
    except ValueError:
        # WordCloud raises ValueError when the text yields no words to draw.
        st.info("Not enough text to build a word cloud.")
        return
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    plt.close(fig)  # avoid accumulating figures across reruns


with tab1:
    st.markdown('<div class="card">', unsafe_allow_html=True)
    st.header("News Classification ")
    st.write("Upload a CSV file containing news excerpts to classify them into categories.")

    # File uploader with a descriptive message
    uploaded_file = st.file_uploader("Choose a CSV file (must contain a 'content' column)", type="csv")

    if uploaded_file is None:
        st.warning("Please upload a CSV file to get started.")
    else:
        try:
            df = pd.read_csv(uploaded_file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            st.error(f"Could not read the uploaded CSV file: {exc}")
            st.stop()

        # Everything below (and the Q&A tab) needs the 'content' column.
        if "content" not in df.columns:
            st.error("The uploaded CSV file must contain a 'content' column.")
            st.stop()

        # Preview uploaded data
        st.subheader("Preview Uploaded Data")
        st.dataframe(df.head(5))

        # Load the fine-tuned news classifier through the shared loader
        classifier = load_classification_model()

        # Preprocess, then classify all articles in one batched call.
        # truncation=True keeps long articles within the model's input limit.
        stop_words = set(stopwords.words('english'))
        preprocessed = _preprocess_articles(df["content"], stop_words)
        with st.spinner("Classifying articles..."):
            predictions = classifier(preprocessed, truncation=True) if preprocessed else []
        df["Class"] = [prediction["label"] for prediction in predictions]

        # Word cloud visualization
        st.subheader("Word Cloud of News Content")
        _show_wordcloud(preprocessed)

        # Keep only necessary columns
        df = df[['content', 'Class']]

        # Show classification results
        st.subheader("Classification Results")
        st.write(df)

        # Show class distribution
        st.subheader("Class Distribution")
        class_dist = df['Class'].value_counts()
        st.bar_chart(class_dist)

        # Download CSV file
        st.subheader("Download Results")
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download output.csv",
            data=csv,
            file_name='output.csv',
            mime='text/csv'
        )
    st.markdown('</div>', unsafe_allow_html=True)
with tab2:
    st.markdown('<div class="card">', unsafe_allow_html=True)
    st.header("Ask Questions Based on Uploaded News Content File")
    st.write("Ask questions about news content and get answers from our AI model.")

    # Build the Q&A context from the uploaded file, if there is one.
    # Blank cells are dropped and values coerced to str so the join cannot fail.
    context = ""
    if uploaded_file is not None:
        context = ' '.join(df['content'].dropna().astype(str))
        st.write(f"Loaded {len(df)} news excerpts")
    else:
        st.warning("Please upload a CSV file.")

    # Generate the answer based on the uploaded news content file
    question = st.text_input("Enter your question:")
    if st.button("Get Answer"):
        if uploaded_file is None:
            st.error("Please upload a CSV file before asking a question.")
        elif not context.strip():
            st.error("The uploaded file has no usable text in its 'content' column.")
        elif question.strip():
            with st.spinner("Searching for answers..."):
                # Load the model for the Q&A pipeline
                qa_pipeline = load_qa_model()
                result = qa_pipeline(question=question, context=context)
            st.subheader("Answer")
            st.success(result['answer'])
            st.subheader("Details")
            st.write(f"Confidence: {result['score']:.2f}")
        else:
            st.error("Please enter a question.")

    # Generate the answer based on news content typed in by the user
    st.markdown("---")
    st.header("Ask Questions Based on Your News Content")
    context_1 = st.text_area("Enter News Content", height=100)
    question_1 = st.text_input("Enter your question:", key="question_input")
    if st.button("Get Answer", key="get_answer_1"):
        # Both the context and the question must contain real text
        if context_1.strip() and question_1.strip():
            with st.spinner("Searching for answers..."):
                qa_pipeline = load_qa_model()
                answer_1 = qa_pipeline(question=question_1, context=context_1)
            st.success(f"Answer: {answer_1['answer']}")
        else:
            st.warning("Provide both context and question.")
    st.markdown('</div>', unsafe_allow_html=True)
# Cached loaders for the default Hugging Face pipelines used in this tab, so
# the models are built once instead of on every button click.
@st.cache_resource
def _load_ner_model():
    """Return the default NER pipeline with sub-word tokens grouped into entities."""
    # aggregation_strategy="simple" is the replacement for the deprecated
    # grouped_entities=True flag.
    return pipeline("ner", aggregation_strategy="simple")


@st.cache_resource
def _load_summarizer():
    """Return the default summarization pipeline."""
    return pipeline("summarization")


@st.cache_resource
def _load_sentiment_model():
    """Return the default sentiment-analysis pipeline."""
    return pipeline("sentiment-analysis")


with tab3:
    st.markdown('<div class="card">', unsafe_allow_html=True)
    st.header("Advanced Features")
    st.write("Explore additional functionalities to enhance your news analysis.")

    # Named Entity Recognition of news content
    st.subheader("Named Entity Recognition Of News Content")
    ner_text = st.text_area("Enter News Content for entity recognition:", height=100)
    if st.button("Extract Entities"):
        if not ner_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Identifying entities..."):
                ner_pipeline = _load_ner_model()
                results = ner_pipeline(ner_text)
            entities = [
                {
                    "Entity": entity['entity_group'],
                    "Word": entity['word'],
                    "Score": entity['score'],
                }
                for entity in results
            ]
            if entities:
                st.table(pd.DataFrame(entities))
            else:
                st.info("No entities found.")

    # Text Summarization
    st.subheader("News Content Summarization")
    summary_text = st.text_area("Enter news content to summarize:", height=150)
    if st.button("Generate Summary"):
        if not summary_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Generating summary..."):
                summarizer = _load_summarizer()
                # truncation=True keeps long articles within the model's input limit
                summary = summarizer(summary_text, max_length=130, min_length=30, truncation=True)
            st.write(summary[0]['summary_text'])

    # Sentiment Analysis
    # NOTE(review): the original indentation was lost; this section is assumed
    # to belong to the "Advanced Features" tab -- confirm against the real file.
    st.subheader("News Tone Detector")
    sentiment_text = st.text_area("Enter text for news content analysis:", height=100)
    if st.button("Analyze Tone"):
        if not sentiment_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Analyzing sentiment..."):
                sentiment_pipeline = _load_sentiment_model()
                result = sentiment_pipeline(sentiment_text, truncation=True)[0]
            st.write(f"Label: {result['label']}")
            st.write(f"Confidence: {result['score']:.2f}")
            if result['label'] == 'POSITIVE':
                st.success("This text appears positive!")
            else:
                st.warning("This text appears negative.")
    st.markdown('</div>', unsafe_allow_html=True)
# Enhanced Sidebar with branding and instructions
with st.sidebar:
    # The logo is optional: skip it when the file is not shipped with the app
    # rather than letting a missing asset abort the whole script.
    if os.path.exists("news_logo.jpg"):
        st.image("news_logo.jpg", width=300)

    st.title("About")
    st.write(
        "\n"
        "This app helps analyze news content:\n"
        "- Classify news into categories\n"
        "- Answer questions about news content\n"
        "- Perform advanced text analysis\n"
    )

    st.title("Instructions")
    st.write(
        "\n"
        "1. Upload a CSV file with a 'content' column.\n"
        "2. Click on the appropriate tab to use a feature.\n"
        "3. Download results as CSV.\n"
        "4. Use the Q&A tab to ask questions about the news.\n"
    )

    st.markdown("[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.4)")
# Footer: a horizontal rule followed by a centred attribution line.
FOOTER_HTML = (
    "<div style='text-align: center;'>"
    "© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers"
    "</div>"
)
st.markdown("---")
st.markdown(FOOTER_HTML, unsafe_allow_html=True)