SpeedWatch / app.py
mbCrypto's picture
Update app.py
edbb831
raw
history blame
5.06 kB
import gradio as gr
from transformers import pipeline, PegasusForConditionalGeneration
# Initialize the space: load every model once at import time so that the
# request handlers below only run inference.

# BART (CNN/DailyMail) pipeline used for YouTube transcripts.
summarizeryt = pipeline("summarization", model="facebook/bart-large-cnn")

# Pipeline used for blog posts. The checkpoint is named explicitly instead of
# relying on the task's implicit default: this is the model transformers
# falls back to for "summarization" when none is given, so behavior is
# unchanged, but it can no longer shift silently with a library upgrade.
summarizerbg = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Raw Pegasus model (no pipeline wrapper) used for free-text summarization;
# the matching tokenizer is loaded by the function that uses it.
summarizertx = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
def _youtube_video_id(video_link):
    """Return the video id contained in a YouTube link.

    Handles ``watch?v=<id>`` URLs (with or without extra query parameters),
    ``youtu.be/<id>`` short links and ``/embed/<id>``, ``/shorts/<id>``,
    ``/live/<id>`` and ``/v/<id>`` paths.

    Raises:
        ValueError: if no video id can be found in ``video_link``.
    """
    from urllib.parse import parse_qs, urlparse

    link = video_link.strip()
    parsed = urlparse(link)

    # Standard watch URL: the id is the "v" query parameter, wherever it
    # sits among the other parameters (t=, list=, ...).
    ids = parse_qs(parsed.query).get("v")
    if ids:
        return ids[0]

    path_parts = [part for part in parsed.path.split("/") if part]
    if parsed.netloc.endswith("youtu.be") and path_parts:
        return path_parts[0]
    if len(path_parts) >= 2 and path_parts[0] in ("embed", "shorts", "live", "v"):
        return path_parts[1]

    # Legacy behavior: anything after the first "=" (kept so inputs that
    # worked with the previous implementation keep working).
    if "=" in link:
        return link.split("=")[1]

    raise ValueError(f"Could not find a YouTube video id in {video_link!r}")


def YTVideoToText(video_link):
    """Fetch the transcript of a YouTube video and summarize it.

    The transcript is cut into slices of 1000 characters; each slice is
    summarized separately with ``summarizeryt`` and the partial summaries
    are joined with single spaces.

    Example:
        YTVideoToText("https://www.youtube.com/watch?v=jQL0ZeHtXFc")

    Args:
        video_link: URL of the video (see ``_youtube_video_id`` for the
            accepted forms).

    Returns:
        A two-item list ``[transcript, summary]``.

    Raises:
        ValueError: if no video id can be extracted from ``video_link``.
    """
    # Imported lazily so the dependency is only needed when this handler runs.
    from youtube_transcript_api import YouTubeTranscriptApi

    # fetching video transcript
    video_id = _youtube_video_id(video_link)
    transcript = YouTubeTranscriptApi.get_transcript(video_id)

    # Join all caption fragments; every fragment is prefixed with a space,
    # which is the exact text the previous implementation produced.
    result = ''.join(' ' + entry['text'] for entry in transcript)

    # Summarize slice by slice. Stepping through the text with range() never
    # yields an empty trailing slice, unlike counting len/1000 + 1 iterations
    # (which sent "" to the model whenever the length was a multiple of 1000).
    chunk_size = 1000
    summarized_text = []
    for start in range(0, len(result), chunk_size):
        chunk = result[start:start + chunk_size]
        if not chunk.strip():
            continue
        out = summarizeryt(chunk, max_new_tokens=130, min_length=30, do_sample=False)
        summarized_text.append(out[0]['summary_text'])

    # returning transcript and summary
    return [result, ' '.join(summarized_text)]
def postSummaryWithBart(blog_link):
    """Download a web page and summarize the text of its headings and paragraphs.

    The text of every ``<h1>`` and ``<p>`` element is joined, split into
    sentences, grouped into chunks of at most 500 space-separated words, and
    each chunk is summarized with ``summarizerbg``.

    Example:
        postSummaryWithBart("https://ethereum.org/en/what-is-ethereum/")

    Args:
        blog_link: URL of the page to summarize.

    Returns:
        A two-item list ``[sentences, summary]`` where ``sentences`` is the
        list of sentences extracted from the page and ``summary`` is the
        concatenated chunk summaries (empty string if the page had no text).
    """
    # Imported lazily so the dependencies are only needed when this handler runs.
    from bs4 import BeautifulSoup
    import requests

    # getting our blog post; the timeout keeps a stalled server from hanging
    # the UI request forever
    r = requests.get(blog_link, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all(['h1', 'p'])
    ARTICLE = ' '.join(result.text for result in results)

    # Tag every sentence end with an explicit marker, then split on it.
    # (The marker had been lost, which left no-op replaces and a
    # split('') call that always raises "ValueError: empty separator".)
    eos = '<eos>'
    for mark in ('.', '?', '!'):
        ARTICLE = ARTICLE.replace(mark, mark + eos)
    sentences = ARTICLE.split(eos)

    # chunking text: append each sentence to the current chunk until it would
    # exceed max_chunk words, then start a new chunk
    max_chunk = 500
    chunks = []
    for sentence in sentences:
        if not sentence.strip():
            # e.g. the empty remainder after the final marker
            continue
        words = sentence.split(' ')
        if chunks and len(chunks[-1]) + len(words) <= max_chunk:
            chunks[-1].extend(words)
        else:
            chunks.append(words)
    chunks = [' '.join(chunk) for chunk in chunks]

    # Nothing to summarize: do not call the model with an empty batch.
    if not chunks:
        return [sentences, '']

    # summarizing text
    res = summarizerbg(chunks, max_new_tokens=1024, min_length=30, do_sample=False)
    # Separate the per-chunk summaries with a space so they do not run together.
    text = ' '.join(summ['summary_text'].strip() for summ in res)

    # returning sentences and summary
    return [sentences, text]
# Tokenizer for "google/pegasus-xsum", created on first use and then reused.
_PEGASUS_TOKENIZER = None


def _pegasus_tokenizer():
    """Return the shared Pegasus tokenizer, loading it on the first call."""
    global _PEGASUS_TOKENIZER
    if _PEGASUS_TOKENIZER is None:
        from transformers import PegasusTokenizer
        _PEGASUS_TOKENIZER = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    return _PEGASUS_TOKENIZER


def abstractiveSummaryWithPegasus(words):
    """Summarize free text with the Pegasus model held in ``summarizertx``.

    Example:
        abstractiveSummaryWithPegasus(\"\"\"Sample text to be summarized\"\"\")

    Args:
        words: The text to summarize. Input longer than the tokenizer's
            maximum length is truncated (``truncation=True``).

    Returns:
        The generated summary as a string; an empty string when ``words`` is
        empty or only whitespace.
    """
    # Nothing to summarize: skip tokenization and generation entirely.
    if not words or not words.strip():
        return ""

    # Reuse one tokenizer instead of reloading it on every request.
    tokenizer = _pegasus_tokenizer()

    # perform summarization
    tokens = tokenizer(words, truncation=True, padding="longest", return_tensors="pt")
    summary = summarizertx.generate(**tokens)

    # skip_special_tokens drops markers such as <pad> and </s>, which would
    # otherwise be shown to the user as part of the summary.
    actual_summ = tokenizer.decode(summary[0], skip_special_tokens=True)

    # returning summary
    return actual_summ
# Main UI: one column with the input / output text boxes and one column with
# a button per summarization backend.
with gr.Blocks() as ui:
    gr.Markdown("""
## Permet de faire le résumé d'une video youtube ou d'un article de blog
""")
    with gr.Row():
        with gr.Column():
            # NOTE: api_name is an argument of event listeners (.click below),
            # not of components, so it is not passed to the text boxes.
            URI = gr.Textbox(
                label="URI à résumer",
                max_lines=1,
                placeholder="https://youtube|website.ext",
            )
            TRANSCRIPT = gr.Textbox(
                label="Transcript à résumer",
                lines=10,
                placeholder="https://youtube|website.ext",
            )
            RESUME = gr.Textbox(
                label="Résumé",
                lines=10,
                interactive=False,
                placeholder="https://youtube|website.ext",
            )
        with gr.Column():
            # YouTube link -> transcript + summary
            gr.Button("Process Youtube").click(
                fn=YTVideoToText,
                inputs=[URI],
                outputs=[TRANSCRIPT, RESUME],
                api_name="process_uri",
            )
            # Web page link -> extracted sentences + summary.
            # Uses its own endpoint name: it previously reused "process_uri",
            # which collided with the YouTube handler above.
            gr.Button("Process HTML").click(
                fn=postSummaryWithBart,
                inputs=[URI],
                outputs=[TRANSCRIPT, RESUME],
                api_name="process_html",
            )
            # Text typed or pasted in the transcript box -> summary
            gr.Button("Process TEXT").click(
                fn=abstractiveSummaryWithPegasus,
                inputs=[TRANSCRIPT],
                outputs=[RESUME],
                api_name="process_text",
            )
#translator_fr = gr.Interface.load("huggingface/Helsinki-NLP/opus-mt-fr-en")
#summarizer = gr.Interface.load("huggingface/sshleifer/distilbart-cnn-12-6")
ui.launch()