{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Look up an arXiv paper by identifier\n",
    "\n",
    "This notebook fetches the arXiv entry for a single paper with the `arxiv` package and prints its public metadata fields, one `key: value` pair per line.\n",
    "\n",
    "Outputs are not stored in the file. Use **Restart Kernel and Run All** to regenerate them; the last code cell needs network access to the arXiv API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import arxiv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Everything a reader might want to change lives in the next cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# arXiv identifier of the paper to look up.\n",
    "PAPER_ID = \"2304.08485\"\n",
    "\n",
    "# Upper bound on the number of entries the search may return.\n",
    "MAX_RESULTS = 10\n",
    "\n",
    "# Wait 3 seconds between API requests and retry a failing request up to 3 times.\n",
    "client = arxiv.Client(delay_seconds=3, num_retries=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the search\n",
    "\n",
    "The identifier is passed through `id_list`, which restricts the search to the listed article IDs, instead of being sent as a free-text `query`. No `sort_by` is set because ordering is irrelevant for an ID lookup."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "search = arxiv.Search(id_list=[PAPER_ID], max_results=MAX_RESULTS)\n",
    "search"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch and inspect the entry\n",
    "\n",
    "Private attributes (names starting with `_`) are skipped when printing. In particular `_raw` holds the unparsed feed entry, which repeats every field already shown and makes the output very long."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialise the generator so the entries stay available to later cells.\n",
    "results = list(client.results(search))\n",
    "\n",
    "# Fail loudly instead of silently printing nothing when the lookup comes back empty.\n",
    "assert results, f\"arXiv returned no entry for {PAPER_ID!r}\"\n",
    "\n",
    "for result in results:\n",
    "    # Print each public attribute in \"key: value\" format.\n",
    "    for key, value in vars(result).items():\n",
    "        if not key.startswith(\"_\"):\n",
    "            print(f\"{key}: {value}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}