<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <meta name="description"
        content="LLaSM: Large Language and Speech Model">
  <meta name="keywords" content="speech-language, multi-modal, LLM, LLaSM">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>LLaSM: Large Language and Speech Model</title>

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/favicon.svg">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>

  <!-- for LLaSM demo -->
  <link rel="stylesheet" href="./static/css/bootsrap.min.css">
  <link rel="stylesheet" href="./static/css/styles.css">
  <script src="./static/js/recorder.mp3.min.js"></script>
  <script src="./static/js/waveview.js"></script> 
  <!-- / for LLaSM demo -->
</head>
<body>

<nav class="navbar" role="navigation" aria-label="main navigation">
  <div class="navbar-brand">
    <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
  <div class="navbar-menu">
    <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
      <a class="navbar-item" href="https://keunhong.com">
      <span class="icon">

          <i class="fas fa-home"></i>
      </span>
      </a>

      <div class="navbar-item has-dropdown is-hoverable">
        <a class="navbar-link">
          More Research
        </a>
        <div class="navbar-dropdown">
          <a class="navbar-item" href="https://huggingface.co/spaces/LinkSoul/Chinese-LLaVA" target="_blank">
            Chinese-LLaVA
          </a>
          <a class="navbar-item" href="https://huggingface.co/LinkSoul/Chinese-Llama-2-7b" target="_blank">
            Chinese-Llama-2-7B
          </a>
        </div>
      </div>
    </div>

  </div>
</nav>


<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">LLaSM: Large Language and Speech Model</h1>
          <div class="is-size-5 publication-authors">
            <span class="author-block" style="color:#008AD7;font-weight:normal;">
              Yu Shu<sup>2</sup>,</span>
            <span class="author-block" style="color:#008AD7;font-weight:normal;">
              Siwei Dong<sup>2</sup>,</span>
            <span class="author-block" style="color:#ed2f09;font-weight:normal;">
              Guangyao Chen<sup>1,3</sup>,
            </span>
            <span class="author-block" style="color:#cc00d7;font-weight:normal;">
              Wenhao Huang<sup>4</sup>,
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Rita Zhang,
              <!-- Rita Zhang<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Daochen Shi,
              <!-- Daochen Shi<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Qiqi Xiang,
              <!-- Qiqi Xiang<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#f68946;font-weight:normal;">
              Yemin Shi<sup>1*</sup>
            </span>
          </div>

          <div class="is-size-5 publication-authors">
            <span class="author-block" style="color:#f68946;font-weight:normal;"><sup>1</sup>LinkSoul.AI,</span>
            <span class="author-block" style="color:#008AD7;font-weight:normal;"><sup>2</sup>Beijing Academy of Artificial Intelligence, China,</span>
            <span class="author-block" style="color:#ed2f09;font-weight:normal;"><sup>3</sup>Peking University, China</span>
            <span class="author-block" style="color:#cc00d7;font-weight:normal;"><sup>4</sup>01.ai</span>
          </div>
          
          <div>
            <span class="author-block"><sup>*</sup>Corresponding author: ymshi@linksoul.ai</span>
          </div>

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <!-- <span class="link-block">
                <a href="" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span> -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2308.15930" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
              <!-- Model Link. -->
              <span class="link-block">
                <a href="https://huggingface.co/LinkSoul/LLaSM-Cllama2" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-atom"></i>
                  </span>
                  <span>Model</span>
                </a>
              </span>
              <!-- Code Link. -->
              <span class="link-block">
                <a href="https://github.com/LinkSoul-AI/LLaSM" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                  </a>
              </span>
              <!-- Dataset Link. -->
              <span class="link-block">
                <a href="https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="far fa-images"></i>
                  </span>
                  <span>Data</span>
                </a>
              </span>
            </div>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Multi-modal large language models have garnered significant interest recently. However,
            most existing work focuses on vision-language models that provide strong capabilities
            in following vision-and-language instructions. Speech is also an important modality
            through which humans interact with the world, so it is crucial for a general-purpose
            assistant to follow multi-modal speech-and-language instructions as well. In this work,
            we propose the <b>L</b>arge <b>L</b>anguage <b>a</b>nd <b>S</b>peech <b>M</b>odel (<b>LLaSM</b>),
            an end-to-end trained large multi-modal speech-language model with cross-modal
            conversational abilities, capable of following speech-and-language instructions. Our early
            experiments show that <b>LLaSM</b> offers a more convenient and natural way for humans to
            interact with artificial intelligence. We also release a large speech instruction-following
            dataset, <b>LLaSM-Audio-Instructions</b>.
          </p>
          <p>
            Our paper makes the following contributions:
          </p>
          <ul>
            <li>
              We build a speech-language multi-modal assistant that can understand and follow speech-and-language instructions, providing a more convenient and natural way for humans to interact with artificial intelligence.
            </li>
            <li>
              We construct and release <a href="https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions" target="_blank">LLaSM-Audio-Instructions</a>, a large-scale Chinese and English speech-text cross-modal instruction-following dataset.
            </li>
            <li>
              We release the code at <a href="https://github.com/LinkSoul-AI/LLaSM" target="_blank">https://github.com/LinkSoul-AI/LLaSM</a>.
            </li>
            <li>
              We release the models <a href="https://huggingface.co/LinkSoul/LLaSM-Cllama2" target="_blank">LLaSM-Chinese-Llama-2-7B</a> and <a href="https://huggingface.co/LinkSoul/LLaSM-Baichuan" target="_blank">LLaSM-Baichuan-7B</a>; a short download sketch follows this list.
            </li>
          </ul>
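          <p>
            As a minimal, hedged sketch of how the released artifacts might be fetched locally, the snippet below uses the Hugging Face <code>huggingface_hub</code> client with the repository IDs linked above. The local directory names are illustrative choices, and the LLaSM code repository remains the authoritative reference for loading the model and running inference.
          </p>
          <pre><code># Illustrative sketch only: download the released LLaSM checkpoint and the
# LLaSM-Audio-Instructions dataset. Repository IDs come from the links above;
# the local directory names are arbitrary.
from huggingface_hub import snapshot_download

# Model weights (the Chinese-Llama-2-7B-based checkpoint).
model_dir = snapshot_download(
    repo_id="LinkSoul/LLaSM-Cllama2",
    local_dir="./LLaSM-Cllama2",
)

# Speech-text instruction-following data.
data_dir = snapshot_download(
    repo_id="LinkSoul/LLaSM-Audio-Instructions",
    repo_type="dataset",
    local_dir="./LLaSM-Audio-Instructions",
)

print(model_dir, data_dir)</code></pre>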
        </div>
      </div>
    </div>
    <!--/ Abstract. -->
  </div>
</section>

<section class="hero is-light is-small">
  <div class="hero-body">
    <h2 class="title is-3" style="text-align: center;">Demo</h2>
    <!-- LLaSM Demo -->
    <div id="llasaLoading" style="position: absolute; width: 100%; z-index: 1; display: flex; justify-content: center; align-items: center;">
      <div style="text-align: center;">
        <img src="./images/duck.gif" alt="loading" />
        <h3>Loading...</h3>
      </div>
    </div>
    <div class="container" id="llasa" style="opacity: 0;">
			<div class="row mt-5 justify-content-center">
			  <div class="col-md-12 mt-3">
          <div id="chat-window" class="card p-2">
            <div class="container my-3">
              <!-- <div id="info"></div> -->
              <div id="results" class="results">
      
              </div>
              <fieldset id="temp_audio" style="text-align: center; height: 100px; border: 1.4px solid #ddd;">
                <legend style="float: initial;
                      text-align: initial;
                      width: initial;
                      margin-left: 10px;
                      font-size: initial;">Audio preview</legend>
                <div id="waveform" style="text-align: center; height: 50px; width: 100%;"></div>
                <audio id="audioPlayer" style="height: 50px; width: 100%; display: none; padding: 0 20px 0 20px;" controls src=""></audio>
              </fieldset>
            </div>
          </div>
		
          <div id="user-input" class="mt-2">
            <div class="input-group">
              <textarea type="text" id="user-text" style="height: 60px; padding: 10px 150px 5px 10px;" placeholder="Type in your message or press record button to speak..."></textarea>
              <div id="input-audio" class="input-group-append p-2">
                <button id="delete_button" class="mb-2 p-2">
                <img id="delete_img" class="mb-2" src="images/error.png" alt="Del">
                </button>
                <button id="start_button" class="mb-2 p-2">
                <img id="start_img" class="mb-2" src="images/microphone.png" alt="Record">
                </button>
                <button id="send_button" class="mb-2 p-2">
                <img id="send_text_img" class="mb-2" src="images/paper-plane.png" alt="Start">
                </button>
              </div>
            </div>
				  </div>
			  </div>
			</div>
    </div>

    <!-- / LLaSM Demo -->
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- Demo Tips. -->
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Tips</h2>
        <div class="content has-text-justified">
          <h4>
            How to use the demo
          </h4>
          <ul>
            <li>
              Type a message in the text box and click the send button on the far right to send it and start chatting.
            </li>
            <li>
              Click the microphone button to start recording, and click it again to stop. Then click the send button to send the voice message.
            </li>
            <li>
              Before sending, you can check the recording in the audio preview area; voice messages in the chat history can also be replayed.
            </li>
            <li>
              Click the reset button to clear the conversation history.
            </li>
            <li>
              Note: this demo is intended only to showcase LLaSM's capabilities and has limited support for topic switching in multi-turn conversations. When changing topics, we recommend clearing the history for a better experience.
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!--/ Demo Tips. -->
  </div>
</section>

<section class="section" id="BibTeX">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title">BibTeX</h2>
        <pre>
          <code>
@misc{shu2023llasm,
      title={LLaSM: Large Language and Speech Model}, 
      author={Yu Shu and Siwei Dong and Guangyao Chen and Wenhao Huang and Ruihua Zhang and Daochen Shi and Qiqi Xiang and Yemin Shi},
      year={2023},
      eprint={2308.15930},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
          </code>
        </pre>
      </div>
    </div>
  </div>
</section>

<section class="section" id="Acknowledgement">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title">Acknowledgement</h2>
        <p>
          This website is adapted from <a href="https://github.com/nerfies/nerfies.github.io" target="_blank">Nerfies</a>, licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative Commons Attribution-ShareAlike 4.0 International License</a>. We thank the open-source projects whose models we build on, including <a href="https://huggingface.co/LinkSoul/Chinese-Llama-2-7b" target="_blank">Chinese-Llama-2-7B</a>, <a href="https://huggingface.co/openai/whisper-large-v2" target="_blank">Whisper</a>, and <a href="https://huggingface.co/baichuan-inc/Baichuan-7B" target="_blank">Baichuan-7B</a>.
        </p>
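        <p>
          To make the relationship between these components concrete, the sketch below shows, in a purely illustrative way, how a Whisper speech encoder's outputs could be projected into an LLM's embedding space through a hypothetical, untrained linear adaptor. It is not the released LLaSM implementation; see the code repository above for the actual model and training details.
        </p>
        <pre><code># Illustrative sketch only (hypothetical adaptor, no trained weights); the
# real implementation lives at https://github.com/LinkSoul-AI/LLaSM.
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          WhisperFeatureExtractor, WhisperModel)

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large-v2")
speech_encoder = WhisperModel.from_pretrained("openai/whisper-large-v2").encoder
llm = AutoModelForCausalLM.from_pretrained("LinkSoul/Chinese-Llama-2-7b")
tokenizer = AutoTokenizer.from_pretrained("LinkSoul/Chinese-Llama-2-7b")

# 30 s of 16 kHz audio (silence here) stands in for a spoken instruction.
waveform = torch.zeros(16000 * 30)
features = feature_extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    speech_hidden = speech_encoder(features.input_features).last_hidden_state

# Hypothetical linear adaptor from the encoder width to the LLM hidden size.
adaptor = torch.nn.Linear(speech_hidden.size(-1), llm.config.hidden_size)
speech_embeds = adaptor(speech_hidden)

# Prepend a text prompt's embeddings and feed the sequence as inputs_embeds.
prompt_ids = tokenizer("Please respond to the spoken instruction:", return_tensors="pt").input_ids
prompt_embeds = llm.get_input_embeddings()(prompt_ids)
inputs_embeds = torch.cat([prompt_embeds, speech_embeds], dim=1)
with torch.no_grad():
    logits = llm(inputs_embeds=inputs_embeds).logits
print(logits.shape)  # (1, prompt_len + speech_len, vocab_size)</code></pre>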
      </div>
    </div>
  </div>
</section>

<!-- for LLaSM demo -->
<script src="./static/js/index_demo.js"></script>
<!-- / for LLaSM demo -->
</body>
</html>