Spaces:
Running
Running
import streamlit as st | |
from streamlit_extras.switch_page_button import switch_page | |
# Page copy keyed by language code ('en' / 'fr'). Each language maps a content
# key to a Markdown string that the page hands to Streamlit for rendering.
# The multi-line strings are kept flush-left on purpose: their text is passed
# to the renderer as-is, so indenting it to match the code would change the
# string contents.
# NOTE: the 'ressources' key keeps its spelling because the rendering code
# looks it up under that exact name; only the English heading text is
# spelled "Resources".
translations = {
    'en': {
        'title': 'PLLaVA',
        'original_tweet': """
[Original tweet](https://twitter.com/mervenoyann/status/1786336055425138939) (May 3, 2024)
""",
        'tweet_1': """
Parameter-free LLaVA for video captioning works like magic! 🤩 Let's take a look!
""",
        'tweet_2': """
Most of the video captioning models work by downsampling video frames to reduce computational complexity and memory requirements without losing a lot of information in the process.
PLLaVA on the other hand, uses pooling! 🤩
<br>
How? 🧐
It takes in frames of video, passed to ViT and then projection layer, and then output goes through average pooling where input shape is (# frames, width, height, text decoder input dim) 👇
""",
        'tweet_3': """
Pooling operation surprisingly reduces the loss of spatial and temporal information. See below some examples on how it can capture the details 🤗
""",
        'tweet_4': """
According to authors' findings, it performs way better than many of the existing models (including proprietary VLMs) and scales very well (on text decoder).
""",
        'tweet_5': """
Model repositories 🤗 [7B](https://t.co/AeSdYsz1U7), [13B](https://t.co/GnI1niTxO7), [34B](https://t.co/HWAM0ZzvDc)
Spaces🤗 [7B](https://t.co/Oms2OLkf7O), [13B](https://t.co/C2RNVNA4uR)
""",
        'ressources': """
Resources:
[PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning](https://arxiv.org/abs/2404.16994)
by Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, Jiashi Feng (2024)
[GitHub](https://github.com/magic-research/PLLaVA)
""",
    },
    'fr': {
        'title': 'PLLaVA',
        'original_tweet': """
[Tweet de base](https://twitter.com/mervenoyann/status/1786336055425138939) (en anglais) (3 mai 2024)
""",
        'tweet_1': """
Parameter-free LLaVA (PLLaVA) pour le sous-titrage vidéo fonctionne comme par magie ! 🤩
Jetons un coup d'œil !
""",
        'tweet_2': """
La plupart des modèles de sous-titrage vidéo fonctionnent par sous-échantillonnage des images vidéo afin de réduire la complexité de calcul et les besoins en mémoire sans perdre beaucoup d'informations au cours du processus.
PLLaVA, quant à lui, utilise le pooling ! 🤩
<br>
Comment ?
Il prend les images de la vidéo, les passe au ViT puis à la couche de projection, et la sortie passe par un average pooling où la forme d'entrée est (# images, largeur, hauteur, dim d'entrée du décodeur de texte) 👇 """,
        'tweet_3': """
L'opération de pooling réduit de manière surprenante la perte d'informations spatiales et temporelles. Voir ci-dessous quelques exemples de la façon dont elle peut capturer les détails 🤗 """,
        'tweet_4': """
Selon les conclusions des auteurs, il est bien plus performant que de nombreux modèles existants (y compris les VLM propriétaires) et passe à l'échelle très bien (sur le décodeur de texte). """,
        'tweet_5': """
Dépôts des modèles 🤗 [7 Mds](https://t.co/AeSdYsz1U7), [13 Mds](https://t.co/GnI1niTxO7), [34 Mds](https://t.co/HWAM0ZzvDc)
Spaces🤗 [7 Mds](https://t.co/Oms2OLkf7O), [13 Mds](https://t.co/C2RNVNA4uR)
""",
        'ressources': """
Ressources :
[PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning](https://arxiv.org/abs/2404.16994)
de Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, Jiashi Feng (2024)
[GitHub](https://github.com/magic-research/PLLaVA)
""",
    },
}
def language_selector():
    """Render the language dropdown and return the chosen language code.

    The dropdown offers the options 'EN' and 'FR', each displayed as its
    flag emoji. The widget is registered under the key 'lang_selector'.

    Returns:
        str: 'en' when 'EN' is selected, otherwise 'fr'.
    """
    languages = {'EN': '🇬🇧', 'FR': '🇫🇷'}
    selected_lang = st.selectbox(
        # Streamlit's docs advise against an empty label (accessibility, and
        # it may be rejected in the future); give a real label and hide it.
        # 'hidden' leaves a spacer where the label would be — presumably the
        # closest match to the previous empty-label layout; verify visually.
        'Language',
        options=list(languages),
        format_func=languages.get,
        key='lang_selector',
        label_visibility='hidden',
    )
    return 'en' if selected_lang == 'EN' else 'fr'
# ---------------------------------------------------------------------------
# Page layout. Runs top to bottom on every script execution.
# ---------------------------------------------------------------------------

# Header row: wide column for the title, narrow one for the language picker.
left_column, right_column = st.columns([5, 1])

# Add a selector to the right column
with right_column:
    lang = language_selector()

# Add a title to the left column
# Only the title sits in this column; the article body below is emitted at
# page level.
with left_column:
    st.title(translations[lang]["title"])


def _spacer(count=1):
    """Emit `count` blank Markdown elements, used as vertical spacing."""
    for _ in range(count):
        st.markdown(""" """)


st.success(translations[lang]["original_tweet"], icon="ℹ️")

# Article body: each entry is a tweet text followed by its illustration.
# The last tweet has no image. Note image_1 is .jpg, the others are .jpeg.
_sections = (
    ("tweet_1", "pages/PLLaVA/image_1.jpg"),
    ("tweet_2", "pages/PLLaVA/image_2.jpeg"),
    ("tweet_3", "pages/PLLaVA/image_3.jpeg"),
    ("tweet_4", "pages/PLLaVA/image_4.jpeg"),
    ("tweet_5", None),
)
for _text_key, _image_path in _sections:
    _spacer()
    # unsafe_allow_html is needed because the tweet texts embed raw <br> tags.
    st.markdown(translations[lang][_text_key], unsafe_allow_html=True)
    if _image_path is not None:
        _spacer()
        st.image(_image_path, use_container_width=True)

_spacer()
st.info(translations[lang]["ressources"], icon="📚")
_spacer(3)

# Footer navigation: (English label, French label, target page) per column.
col1, col2, col3 = st.columns(3)
_nav_buttons = (
    (col1, "Previous paper", "Papier précédent", "MiniGemini"),
    (col2, "Home", "Accueil", "Home"),
    (col3, "Next paper", "Papier suivant", "CuMo"),
)
for _column, _label_en, _label_fr, _target_page in _nav_buttons:
    with _column:
        # Any language other than "en" gets the French label.
        _label = _label_en if lang == "en" else _label_fr
        if st.button(_label, use_container_width=True):
            switch_page(_target_page)