sohail-shaikh-s07
committed on
App will work without an lxml error
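One note for context: the lxml error this commit message refers to is commonly the lxml >= 5.2 change that split lxml.html.clean into the separate lxml_html_clean package, which breaks newspaper3k at import time. If that is the error in question, installing the dependency as pip install "lxml[html_clean]" also resolves it (an assumption about the error's cause; requirements.txt is not part of this diff).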
app.py CHANGED

@@ -1,64 +1,100 @@
 import gradio as gr
-from transformers import pipeline
 from newspaper import Article
+from transformers import pipeline
+import nltk
 import torch
 
-# …
-…
+# Download required NLTK data
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
 
-# …
-…
-1. Key facts and main points
-2. Important details and context
-3. Maintain objectivity
-4. Preserve accuracy of information
-Avoid: opinions, redundancy, and unnecessary details."""
+# Initialize the summarization pipeline
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
 
-def …
+def extract_and_summarize(url):
     try:
-        # Download and parse
+        # Download and parse article
         article = Article(url)
         article.download()
         article.parse()
+        article.nlp()
 
-        # Get the text
+        # Get the main text
         text = article.text
 
-        # …
-        text_with_prompt = SYSTEM_PROMPT + "\n\nArticle:\n" + text
-
-        # If the text is too long, split it into chunks
+        # If text is too long, split it into chunks
         max_chunk_length = 1024
-        chunks = […
+        chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
 
+        # Summarize each chunk
         summaries = []
         for chunk in chunks:
-            # …
-            …
-            summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
-            summaries.append(summary[0]['summary_text'])
+            if len(chunk.strip()) > 100:  # Only summarize chunks with substantial content
+                summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
+                summaries.append(summary[0]['summary_text'])
 
-        # Combine
+        # Combine summaries
         final_summary = " ".join(summaries)
-        …
+
+        return {
+            "Title": article.title,
+            "Summary": final_summary,
+            "Keywords": ", ".join(article.keywords) if article.keywords else "No keywords available",
+            "Article Length": len(text),
+            "Summary Length": len(final_summary)
+        }
 
     except Exception as e:
-        return …
+        return {
+            "Error": f"An error occurred: {str(e)}",
+            "Title": "Error",
+            "Summary": "Could not process the article",
+            "Keywords": "N/A",
+            "Article Length": 0,
+            "Summary Length": 0
+        }
 
 # Create Gradio interface
-…
-)
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📰 News Article Summarizer")
+    gr.Markdown("Enter a news article URL to get a concise summary using BART-large-CNN model")
+
+    with gr.Row():
+        url_input = gr.Textbox(label="Article URL", placeholder="Enter news article URL here...")
+
+    with gr.Row():
+        submit_btn = gr.Button("Summarize", variant="primary")
+
+    with gr.Row():
+        output = gr.JSON(label="Results")
+
+    # Example URLs
+    gr.Examples(
+        examples=[
+            ["https://www.bbc.com/news/world-europe-65454331"],
+            ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
+        ],
+        inputs=url_input,
+        label="Example Articles"
+    )
+
+    submit_btn.click(
+        fn=extract_and_summarize,
+        inputs=url_input,
+        outputs=output
+    )
+
+    gr.Markdown("""
+    ## How it works
+    1. Enter a URL of any news article
+    2. The app extracts the main content using newspaper3k
+    3. BART-large-CNN model summarizes the content
+    4. Get a concise summary along with article metadata
+
+    Note: For very long articles, the text is split into chunks and summarized separately.
+    """)
 
 if __name__ == "__main__":
-    …
+    demo.launch()
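For a quick check of the new code path outside the Gradio UI, something like the following should work (a minimal sketch, not part of the commit; it assumes this revision of app.py is in the working directory and that the example URL is reachable — importing app executes the module top level, so the NLTK punkt check runs and the BART model loads at import time):

# Hypothetical smoke test; not part of the commit.
# Importing app triggers the nltk download guard and pipeline load,
# but demo.launch() stays behind the __main__ guard.
from app import extract_and_summarize

result = extract_and_summarize("https://www.bbc.com/news/world-europe-65454331")
for key, value in result.items():
    print(f"{key}: {value}")

One design note on the chunking: max_chunk_length = 1024 counts characters, while the 1024-position input limit of facebook/bart-large-cnn is in tokens, so each chunk lands well under the model's limit. A token-aware splitter would pack chunks more efficiently, but the character-based split is safe as written.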