sohail-shaikh-s07 committed on
Commit
78d5ce1
·
verified ·
1 Parent(s): 7b6d50e

App will work without an lxml error

Browse files
Files changed (1) hide show
  1. app.py +75 -39
app.py CHANGED
@@ -1,64 +1,100 @@
1
  import gradio as gr
2
- from transformers import pipeline
3
  from newspaper import Article
 
 
4
  import torch
5
 
6
- # Initialize the summarization pipeline
7
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
 
 
8
 
9
- # Define system prompt
10
- SYSTEM_PROMPT = """Summarize the given news article in a clear, concise, and accurate manner. Focus on:
11
- 1. Key facts and main points
12
- 2. Important details and context
13
- 3. Maintain objectivity
14
- 4. Preserve accuracy of information
15
- Avoid: opinions, redundancy, and unnecessary details."""
16
 
17
- def summarize_article(url):
18
  try:
19
- # Download and parse the article
20
  article = Article(url)
21
  article.download()
22
  article.parse()
 
23
 
24
- # Get the text content
25
  text = article.text
26
 
27
- # Prepend system prompt to the text
28
- text_with_prompt = SYSTEM_PROMPT + "\n\nArticle:\n" + text
29
-
30
- # If the text is too long, split it into chunks
31
  max_chunk_length = 1024
32
- chunks = [text_with_prompt[i:i + max_chunk_length] for i in range(0, len(text_with_prompt), max_chunk_length)]
33
 
 
34
  summaries = []
35
  for chunk in chunks:
36
- # Skip empty chunks
37
- if not chunk.strip():
38
- continue
39
-
40
- summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
41
- summaries.append(summary[0]['summary_text'])
42
 
43
- # Combine all summaries
44
  final_summary = " ".join(summaries)
45
- return final_summary
 
 
 
 
 
 
 
46
 
47
  except Exception as e:
48
- return f"Error processing the article: {str(e)}"
 
 
 
 
 
 
 
49
 
50
  # Create Gradio interface
51
- iface = gr.Interface(
52
- fn=summarize_article,
53
- inputs=gr.Textbox(label="Enter News Article URL", placeholder="https://..."),
54
- outputs=gr.Textbox(label="Summary"),
55
- title="News Article Summarizer",
56
- description="Enter a news article URL to get a concise summary. The summary will be clean, accurate, and focused on the main points.",
57
- examples=[
58
- ["https://www.bbc.com/news/world-us-canada-67841980"],
59
- ],
60
- theme=gr.themes.Soft()
61
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  if __name__ == "__main__":
64
- iface.launch()
 
1
  import gradio as gr
 
2
  from newspaper import Article
3
+ from transformers import pipeline
4
+ import nltk
5
  import torch
6
 
7
+ # Download required NLTK data
8
+ try:
9
+ nltk.data.find('tokenizers/punkt')
10
+ except LookupError:
11
+ nltk.download('punkt')
12
 
13
+ # Initialize the summarization pipeline
14
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
 
 
 
 
 
15
 
16
+ def extract_and_summarize(url):
17
  try:
18
+ # Download and parse article
19
  article = Article(url)
20
  article.download()
21
  article.parse()
22
+ article.nlp()
23
 
24
+ # Get the main text
25
  text = article.text
26
 
27
+ # If text is too long, split it into chunks
 
 
 
28
  max_chunk_length = 1024
29
+ chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
30
 
31
+ # Summarize each chunk
32
  summaries = []
33
  for chunk in chunks:
34
+ if len(chunk.strip()) > 100: # Only summarize chunks with substantial content
35
+ summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
36
+ summaries.append(summary[0]['summary_text'])
 
 
 
37
 
38
+ # Combine summaries
39
  final_summary = " ".join(summaries)
40
+
41
+ return {
42
+ "Title": article.title,
43
+ "Summary": final_summary,
44
+ "Keywords": ", ".join(article.keywords) if article.keywords else "No keywords available",
45
+ "Article Length": len(text),
46
+ "Summary Length": len(final_summary)
47
+ }
48
 
49
  except Exception as e:
50
+ return {
51
+ "Error": f"An error occurred: {str(e)}",
52
+ "Title": "Error",
53
+ "Summary": "Could not process the article",
54
+ "Keywords": "N/A",
55
+ "Article Length": 0,
56
+ "Summary Length": 0
57
+ }
58
 
59
  # Create Gradio interface
60
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
61
+ gr.Markdown("# 📰 News Article Summarizer")
62
+ gr.Markdown("Enter a news article URL to get a concise summary using BART-large-CNN model")
63
+
64
+ with gr.Row():
65
+ url_input = gr.Textbox(label="Article URL", placeholder="Enter news article URL here...")
66
+
67
+ with gr.Row():
68
+ submit_btn = gr.Button("Summarize", variant="primary")
69
+
70
+ with gr.Row():
71
+ output = gr.JSON(label="Results")
72
+
73
+ # Example URLs
74
+ gr.Examples(
75
+ examples=[
76
+ ["https://www.bbc.com/news/world-europe-65454331"],
77
+ ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
78
+ ],
79
+ inputs=url_input,
80
+ label="Example Articles"
81
+ )
82
+
83
+ submit_btn.click(
84
+ fn=extract_and_summarize,
85
+ inputs=url_input,
86
+ outputs=output
87
+ )
88
+
89
+ gr.Markdown("""
90
+ ## How it works
91
+ 1. Enter a URL of any news article
92
+ 2. The app extracts the main content using newspaper3k
93
+ 3. BART-large-CNN model summarizes the content
94
+ 4. Get a concise summary along with article metadata
95
+
96
+ Note: For very long articles, the text is split into chunks and summarized separately.
97
+ """)
98
 
99
  if __name__ == "__main__":
100
+ demo.launch()