{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import arxiv\n",
    "\n",
    "# API client: 3-second delay between requests, up to 3 retries.\n",
    "client = arxiv.Client(delay_seconds=3, num_retries=3)\n",
    "\n",
    "# arXiv identifier of the paper to fetch.\n",
    "ARXIV_ID: str = \"2304.08485\"\n",
    "\n",
    "# Upper bound on the number of results the search may yield.\n",
    "max_results: int = 10\n",
    "\n",
    "# Look the paper up through `id_list`, which restricts the search to the\n",
    "# listed article IDs. Passing the ID as a free-text `query` may also match\n",
    "# other papers whose metadata happens to contain the same string.\n",
    "# `sort_by` only affects ordering once more IDs are added to the list.\n",
    "search = arxiv.Search(\n",
    "    id_list=[ARXIV_ID],\n",
    "    max_results=max_results,\n",
    "    sort_by=arxiv.SortCriterion.SubmittedDate,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "arxiv.Search(query='2304.08485', id_list=[], max_results=10, sort_by=<SortCriterion.SubmittedDate: 'submittedDate'>, sort_order=<SortOrder.Descending: 'descending'>)\n"
     ]
    }
   ],
   "source": [
    "# Show the Search object's string form to check its parameters\n",
    "# before any results are fetched.\n",
    "print(search)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "entry_id:  http://arxiv.org/abs/2304.08485v2\n",
      "updated:  2023-12-11 17:46:14+00:00\n",
      "published:  2023-04-17 17:59:25+00:00\n",
      "title:  Visual Instruction Tuning\n",
      "authors:  [arxiv.Result.Author('Haotian Liu'), arxiv.Result.Author('Chunyuan Li'), arxiv.Result.Author('Qingyang Wu'), arxiv.Result.Author('Yong Jae Lee')]\n",
      "summary:  Instruction tuning large language models (LLMs) using machine-generated\n",
      "instruction-following data has improved zero-shot capabilities on new tasks,\n",
      "but the idea is less explored in the multimodal field. In this paper, we\n",
      "present the first attempt to use language-only GPT-4 to generate multimodal\n",
      "language-image instruction-following data. By instruction tuning on such\n",
      "generated data, we introduce LLaVA: Large Language and Vision Assistant, an\n",
      "end-to-end trained large multimodal model that connects a vision encoder and\n",
      "LLM for general-purpose visual and language understanding.Our early experiments\n",
      "show that LLaVA demonstrates impressive multimodel chat abilities, sometimes\n",
      "exhibiting the behaviors of multimodal GPT-4 on unseen images/instructions, and\n",
      "yields a 85.1% relative score compared with GPT-4 on a synthetic multimodal\n",
      "instruction-following dataset. When fine-tuned on Science QA, the synergy of\n",
      "LLaVA and GPT-4 achieves a new state-of-the-art accuracy of 92.53%. We make\n",
      "GPT-4 generated visual instruction tuning data, our model and code base\n",
      "publicly available.\n",
      "comment:  NeurIPS 2023 Oral; project page: https://llava-vl.github.io/\n",
      "journal_ref:  None\n",
      "doi:  None\n",
      "primary_category:  cs.CV\n",
      "categories:  ['cs.CV', 'cs.AI', 'cs.CL', 'cs.LG']\n",
      "links:  [arxiv.Result.Link('http://arxiv.org/abs/2304.08485v2', title=None, rel='alternate', content_type=None), arxiv.Result.Link('http://arxiv.org/pdf/2304.08485v2', title='pdf', rel='related', content_type=None)]\n",
      "pdf_url:  http://arxiv.org/pdf/2304.08485v2\n"
     ]
    }
   ],
   "source": [
    "# Fetch every match and keep the Result objects for later cells.\n",
    "results = []\n",
    "for result in client.results(search):\n",
    "    results.append(result)\n",
    "    # Print every public field in \"key:  value\" format. Underscore-prefixed\n",
    "    # attributes such as `_raw` (the raw entry dict, which repeats the fields\n",
    "    # above) are skipped so the output stays readable.\n",
    "    for key, value in vars(result).items():\n",
    "        if key.startswith(\"_\"):\n",
    "            continue\n",
    "        print(f\"{key}:  {value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}