from gpt4all import GPT4All, Embed4All
import langchain_community as lcc
from langchain_community.chat_models import ChatHuggingFace, ChatOllama

# BUG FIX: Embed4All.embed() embeds the string it is given. The original code
# passed the literal filename 'Aditya_test.txt', so the printed vector was an
# embedding of the *filename*, not of the file's contents. Read the file first.
with open('Aditya_test.txt', encoding='utf-8') as f:
    text = f.read()

embedder = Embed4All()
output = embedder.embed(text)
print(output)

# BUG FIX: ChatOllama was called but never imported (only ChatHuggingFace was),
# which raises NameError on a fresh kernel; it is now imported above.
local_llm = 'NousResearch/Yarn-Mistral-7b-128k'
llm = ChatOllama(model=local_llm, temperature=0)
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

embedder = GPT4AllEmbeddings()

# BUG FIX: Chroma.from_documents() reads `doc.page_content` / `doc.metadata`,
# so it must receive Document objects. Passing plain dicts caused the recorded
# AttributeError: 'dict' object has no attribute 'page_content'.
# Also, the vector store computes embeddings itself via `embedding=embedder`;
# pre-embedding each text with embedder.embed() was redundant and is dropped.
documents = [Document(page_content=doc_text) for doc_text in documents_list]

vectorstore = Chroma.from_documents(
    documents=documents,
    collection_name="rag-chroma",
    embedding=embedder,
)
retriever = vectorstore.as_retriever()

# BUG FIX: TextLoader loads files from disk and has no `load_documents(query)`
# API. A natural-language query is answered by the retriever built above.
query = "who is Aditya"
documents = retriever.get_relevant_documents(query)
# BUG FIX: `Rag` does not exist in langchain's llms module -- the ImportError
# captured in this cell's output is genuine, and there is no such class to
# import. Instead of a phantom class, make the generator model-agnostic: any
# chat/LLM object exposing `.invoke(prompt)` (for example the `llm` built
# earlier with ChatOllama) can be passed in.

def generate_answer(rag_model, query, documents):
    """Answer `query` using `documents` as retrieval context.

    Args:
        rag_model: any LLM-like object with an ``.invoke(prompt)`` method.
        query: the user question (str).
        documents: iterable of context strings.

    Returns:
        The model's raw response to the contextualised prompt.
    """
    # Simple concatenation; a production system would rank/truncate context
    # to fit the model window.
    context = ' '.join(documents)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    return rag_model.invoke(prompt)

# Example usage (requires `llm`, `query`, `documents` from earlier cells):
# answer = generate_answer(llm, query, documents)
# print("Generated Answer:", answer)
# BUG FIX: `langchain.training` / `train_model` do not exist -- langchain
# provides no fine-tuning API. Fine-tuning must go through the underlying
# model's own toolkit (e.g. Hugging Face `transformers.Trainer`). The stdout
# shown for this cell came from an earlier kernel state, not from this code.
# Also removed the literal `...` (Ellipsis) that sat inside the data list.

# Define the (question, answer) fine-tuning dataset.
training_data = [
    ("Question 1", "Answer 1"),
    ("Question 2", "Answer 2"),
    # TODO: replace placeholders with real (question, answer) pairs
]
import torch  # BUG FIX: `torch` is used below but was never imported (NameError)

from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
# NOTE(review): RagRetriever.from_pretrained normally needs an index argument
# (e.g. index_name="exact", use_dummy_dataset=True) -- confirm this loads.
retriever = RagRetriever.from_pretrained("facebook/rag-token-base")
generator = RagTokenForGeneration.from_pretrained("facebook/rag-token-base")


def generate_answer(tokenizer, retriever, generator, query, documents):
    """Generate an answer to `query` with a RAG token model.

    Args:
        tokenizer: RagTokenizer used to encode the query/documents.
        retriever: RagRetriever producing document relevance scores.
        generator: RagTokenForGeneration model.
        query: the user question (str).
        documents: context passed to the tokenizer as the text pair.

    Returns:
        List of decoded answer strings (one per batch item).
    """
    inputs = tokenizer(query, documents, return_tensors="pt",
                       padding="max_length", max_length=256, truncation=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    doc_scores = retriever(input_ids, attention_mask)
    # NOTE(review): RagTokenizer exposes no `context_id` attribute -- this line
    # raises AttributeError at runtime; the intended special-token id must come
    # from the underlying question-encoder/generator tokenizer. TODO confirm.
    context_input_ids = input_ids.new_full((input_ids.shape[0], 1),
                                           tokenizer.context_id, dtype=torch.long)
    context_attention_mask = input_ids.new_full(context_input_ids.shape, 1)
    generator_input_ids = torch.cat([context_input_ids, input_ids], dim=1)
    generator_attention_mask = torch.cat([context_attention_mask, attention_mask], dim=1)
    outputs = generator.generate(generator_input_ids,
                                 attention_mask=generator_attention_mask,
                                 doc_scores=doc_scores)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
import os
import timeit

# The old `haystack.indexing.*` paths were removed (see the ModuleNotFoundError
# in this cell's output); the equivalents live under `haystack.preprocessor`.
# Dropped `clean_whitespace, clean_html, clean_preprocessor`: they were unused
# and do not appear to exist in `haystack.preprocessor.cleaning` -- verify.
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.preprocessor import PreProcessor
from haystack.document_store import InMemoryDocumentStore, WeaviateDocumentStore
from haystack.retriever.dense import EmbeddingRetriever
from haystack.utils import print_answers

# NOTE(review): `cfg` is referenced throughout but never imported anywhere in
# this notebook, so this cell raises NameError on a fresh kernel. It must
# provide DATA_PATH, WEAVIATE_HOST/PORT/EMBEDDING_DIM, PRE_PROCESSOR_* and
# EMBEDDINGS -- import the project's config module here. TODO confirm name.


def run_ingest():
    """Ingest 'Aditya_train.txt' into a Weaviate document store.

    Converts the file to haystack dicts, preprocesses them into overlapping
    word chunks, writes them to Weaviate, and updates dense embeddings via an
    EmbeddingRetriever. Prints elapsed time, or a message if the file is absent.
    """
    data_file = "Aditya_train.txt"
    data_path = os.path.join(cfg.DATA_PATH, data_file)

    if not os.path.isfile(data_path):
        print(f"File {data_file} not found in the specified DATA_PATH.")
        return

    start = timeit.default_timer()

    vector_store = WeaviateDocumentStore(host=cfg.WEAVIATE_HOST,
                                         port=cfg.WEAVIATE_PORT,
                                         embedding_dim=cfg.WEAVIATE_EMBEDDING_DIM)

    # BUG FIX: convert_files_to_dicts expects a *directory* in dir_path; the
    # original passed the file path itself. Convert the directory, then keep
    # only the target file.
    raw_docs = convert_files_to_dicts(dir_path=cfg.DATA_PATH,
                                      clean_func=clean_wiki_text,
                                      split_paragraphs=True)
    raw_docs = [doc for doc in raw_docs if doc.get('name') == data_file]

    # Re-shape into the {'content', 'meta'} schema the preprocessor expects.
    final_doc = [{'content': doc['text'], 'meta': {'name': doc['name']}}
                 for doc in raw_docs]

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=False,
        clean_header_footer=False,
        split_by="word",
        language="en",
        split_length=cfg.PRE_PROCESSOR_SPLIT_LENGTH,
        split_overlap=cfg.PRE_PROCESSOR_SPLIT_OVERLAP,
        split_respect_sentence_boundary=True,
    )

    preprocessed_docs = preprocessor.process(final_doc)
    vector_store.write_documents(preprocessed_docs)

    retriever = EmbeddingRetriever(
        document_store=vector_store,
        embedding_model=cfg.EMBEDDINGS
    )
    vector_store.update_embeddings(retriever)

    end = timeit.default_timer()
    print(f"Time to prepare embeddings: {end - start}")