KevlarVK committed
Commit
a4f4f24
1 Parent(s): f1e08af

simple code to summarize using bart-large-cnn

Files changed (3)
  1. Utils.py +38 -0
  2. app.py +11 -2
  3. summarize.py +50 -0
Utils.py ADDED
@@ -0,0 +1,38 @@
+ import requests
+ from bs4 import BeautifulSoup
+
+ def fetch_article_text(url: str):
+     # Fetch the page and collect the text of headline and paragraph tags
+     r = requests.get(url)
+     soup = BeautifulSoup(r.text, "html.parser")
+     results = soup.find_all(["h1", "p"])
+     text = [result.text for result in results]
+     ARTICLE = " ".join(text)
+
+     # Mark sentence boundaries, then split the article into sentences
+     ARTICLE = ARTICLE.replace(".", ".<eos>")
+     ARTICLE = ARTICLE.replace("!", "!<eos>")
+     ARTICLE = ARTICLE.replace("?", "?<eos>")
+     sentences = ARTICLE.split("<eos>")
+
+     # Greedily pack sentences into chunks of at most 500 words
+     current_chunk = 0
+     chunks = []
+     for sentence in sentences:
+         if len(chunks) == current_chunk + 1:
+             if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
+                 chunks[current_chunk].extend(sentence.split(" "))
+             else:
+                 current_chunk += 1
+                 chunks.append(sentence.split(" "))
+         else:
+             chunks.append(sentence.split(" "))
+
+     # Re-join each chunk's word list into a single string
+     for chunk_id in range(len(chunks)):
+         chunks[chunk_id] = " ".join(chunks[chunk_id])
+
+     return ARTICLE, chunks
+
+ def count_tokens(text: str):
+     # Rough token count: whitespace-separated words
+     return len(text.split(" "))
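
For quick local testing, this helper can be exercised on its own; a minimal sketch, assuming any reachable article page (the URL below is a placeholder, not from the commit):

import Utils

article, chunks = Utils.fetch_article_text("https://example.com/article")  # placeholder URL
print(len(chunks), "chunks,", Utils.count_tokens(article), "words total")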
app.py CHANGED
@@ -1,4 +1,13 @@
  import streamlit as st
+ from summarize import bart_summarize
 
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+ # Text field for the article to summarize
+ text = st.text_input("Enter text here")
+
+ # Button that triggers summarization
+ button = st.button("Click here")
+
+ # Summarize the entered text when the button is clicked
+ if button:
+     summary = bart_summarize(text)
+     st.write(summary)
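
Note: summarize.py (below) relies on NLTK's sentence tokenizer, so the punkt data must be available before the app is launched with `streamlit run app.py`; a one-time setup sketch:

# One-time setup: download NLTK's sentence tokenizer data (skip if already installed)
import nltk
nltk.download("punkt")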
summarize.py ADDED
@@ -0,0 +1,50 @@
+ from transformers import BartTokenizer, TFBartForConditionalGeneration
+ from Utils import count_tokens
+ from nltk.tokenize import sent_tokenize  # needs the NLTK 'punkt' data
+
+ tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+ model = TFBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+
+ def bart_summarize(text: str):
+     # Model input limit in positions (1024 for bart-large-cnn)
+     max_length = model.config.max_position_embeddings
+
+     # Split the input into sentences and drop empty or very short fragments
+     sentences = sent_tokenize(text)
+     sentences = [sentence for sentence in sentences if len(sentence.strip()) > 0 and len(sentence.split(" ")) > 4]
+
+     # Greedily pack sentences into chunks that fit the model's input size.
+     # count_tokens counts words, a rough proxy for subword tokens; the
+     # tokenizer's truncation below is the safety net for any overshoot.
+     input_chunks = []
+     temp_sentences = ""
+     tokens = 0
+
+     for sentence in sentences:
+         if tokens + count_tokens(sentence) < max_length:
+             temp_sentences += " " + sentence
+             tokens += count_tokens(sentence)
+         else:
+             input_chunks.append(temp_sentences)
+             tokens = count_tokens(sentence)
+             temp_sentences = sentence
+
+     if len(temp_sentences) > 0:
+         input_chunks.append(temp_sentences)
+
+     # Summarize each input chunk separately
+     summaries = []
+     for chunk in input_chunks:
+         # Encode the input chunk
+         encoded_input = tokenizer.encode(chunk, max_length=max_length, truncation=True, padding='longest', return_tensors='tf')
+
+         # Generate a summary for the input chunk
+         summary_ids = model.generate(encoded_input, max_length=300, num_beams=4, early_stopping=True)
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+         summaries.append(summary)
+
+     # Combine the chunk summaries into the final summary for the entire input
+     final_summary = " ".join(summaries)
+
+     return final_summary
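
Putting the pieces together outside Streamlit; a minimal end-to-end sketch, assuming a reachable article URL (the one below is a placeholder, not from the commit):

from Utils import fetch_article_text
from summarize import bart_summarize

article, _ = fetch_article_text("https://example.com/article")  # placeholder URL
print(bart_summarize(article))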