{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import zipfile\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_corpus(json_file):\n",
    "    \"\"\"Split wiki records into parallel lists of article texts and page titles.\n",
    "\n",
    "    Args:\n",
    "        json_file: iterable of records, each indexable by the 'text' and 'page' keys.\n",
    "\n",
    "    Returns:\n",
    "        A (texts, titles) tuple of two lists, index-aligned and in input order.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a dict record lacks the 'text' or 'page' key.\n",
    "    \"\"\"\n",
    "    # Single pass over the input so one-shot iterables are consumed exactly once.\n",
    "    records = [(entry['text'], entry['page']) for entry in json_file]\n",
    "    texts = [text for text, _ in records]\n",
    "    names = [name for _, name in records]\n",
    "    return (texts, names)\n",
    "\n",
    "# The wiki dump is a JSON array of objects, each with 'page' and 'text' fields.\n",
    "# Uncompressed alternative dump:\n",
    "#     with open('resources/wiki_text_16.json') as f:\n",
    "#         doc = json.load(f)\n",
    "with zipfile.ZipFile('resources/wiki_text_4.json.zip', 'r') as z, z.open('wiki_text_4.json') as f:\n",
    "    doc = json.load(f)\n",
    "\n",
    "# Parallel lists: corpus[i] is the article text whose page title is titles[i].\n",
    "corpus, titles = create_corpus(doc)\n",
    "\n",
    "# Fit the TF-IDF vocabulary on the corpus and vectorize it in one step.\n",
    "vectorizer = TfidfVectorizer()\n",
    "tfidf = vectorizer.fit_transform(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Free-text query; adjacent literals are joined at compile time into one string.\n",
    "question = (\n",
    "    \"An object orbiting this planet contains sections named Liberty, Equality, and Fraternity. \"\n",
    "    \"A small group of clouds on this planet was nicknamed \\\"the Scooter\\\" for its high speed. \"\n",
    "    \"Volcanoes that eject ice were first observed on an object that orbits this planet. \"\n",
    "    \"The first high resolution images of this object were taken by Voyager 2 \"\n",
    "    \"and revealed a storm system known as the \\\"Great Dark Spot\\\". \"\n",
    "    \"Johann Galle first observed this planet from a telescope using predictions made by \"\n",
    "    \"Urbain Le Verrier [\\\"ur-BAIN le vay-ree-AY\\\"] about its effects on the orbit of Uranus. \"\n",
    "    \"For 10 points, name this dark blue gas giant, the outermost planet in the Solar System.\"\n",
    ")\n",
    "# transform (not fit_transform) so the question is encoded with the existing vectorizer state.\n",
    "tfidf_question = vectorizer.transform([question])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Methods_of_detecting_exoplanets\n"
     ]
    }
   ],
   "source": [
    "# Cosine similarity between every corpus document and the question vector.\n",
    "sim = cosine_similarity(tfidf, tfidf_question)\n",
    "\n",
    "# argmax() indexes the flattened array; with a single question column that\n",
    "# position is also the row (document) index of the best match.\n",
    "sim_index = sim.argmax()\n",
    "sim_doc = corpus[sim_index]\n",
    "\n",
    "print(titles[sim_index])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}