from gpt4all import GPT4All, Embed4All
import langchain_community as lcc
from langchain_community.chat_models import ChatHuggingFace, ChatOllama

# BUG FIX: Embed4All.embed() embeds the string it is given. The original code
# passed the literal filename 'Aditya_test.txt', so the printed vector was an
# embedding of the *filename*, not of the file's contents. Read the file first.
with open('Aditya_test.txt', encoding='utf-8') as f:
    text = f.read()

embedder = Embed4All()
output = embedder.embed(text)
print(output)

# BUG FIX: ChatOllama was called but never imported (only ChatHuggingFace was),
# which raises NameError on a fresh kernel; it is now imported above.
local_llm = 'NousResearch/Yarn-Mistral-7b-128k'
llm = ChatOllama(model=local_llm, temperature=0)
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

embedder = GPT4AllEmbeddings()

# BUG FIX: Chroma.from_documents() reads `doc.page_content` / `doc.metadata`,
# so it must receive Document objects. Passing plain dicts caused the recorded
# AttributeError: 'dict' object has no attribute 'page_content'.
# Also, the vector store computes embeddings itself via `embedding=embedder`;
# pre-embedding each text with embedder.embed() was redundant and is dropped.
documents = [Document(page_content=doc_text) for doc_text in documents_list]

vectorstore = Chroma.from_documents(
    documents=documents,
    collection_name="rag-chroma",
    embedding=embedder,
)
retriever = vectorstore.as_retriever()

# BUG FIX: TextLoader loads files from disk and has no `load_documents(query)`
# API. A natural-language query is answered by the retriever built above.
query = "who is Aditya"
documents = retriever.get_relevant_documents(query)
# BUG FIX: `Rag` does not exist in langchain's llms module -- the ImportError
# captured in this cell's output is genuine, and there is no such class to
# import. Instead of a phantom class, make the generator model-agnostic: any
# chat/LLM object exposing `.invoke(prompt)` (for example the `llm` built
# earlier with ChatOllama) can be passed in.

def generate_answer(rag_model, query, documents):
    """Answer `query` using `documents` as retrieval context.

    Args:
        rag_model: any LLM-like object with an ``.invoke(prompt)`` method.
        query: the user question (str).
        documents: iterable of context strings.

    Returns:
        The model's raw response to the contextualised prompt.
    """
    # Simple concatenation; a production system would rank/truncate context
    # to fit the model window.
    context = ' '.join(documents)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    return rag_model.invoke(prompt)

# Example usage (requires `llm`, `query`, `documents` from earlier cells):
# answer = generate_answer(llm, query, documents)
# print("Generated Answer:", answer)
# BUG FIX: `langchain.training` / `train_model` do not exist -- langchain
# provides no fine-tuning API. Fine-tuning must go through the underlying
# model's own toolkit (e.g. Hugging Face `transformers.Trainer`). The stdout
# shown for this cell came from an earlier kernel state, not from this code.
# Also removed the literal `...` (Ellipsis) that sat inside the data list.

# Define the (question, answer) fine-tuning dataset.
training_data = [
    ("Question 1", "Answer 1"),
    ("Question 2", "Answer 2"),
    # TODO: replace placeholders with real (question, answer) pairs
]
import torch  # BUG FIX: `torch` is used below but was never imported (NameError)

from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
# NOTE(review): RagRetriever.from_pretrained normally needs an index argument
# (e.g. index_name="exact", use_dummy_dataset=True) -- confirm this loads.
retriever = RagRetriever.from_pretrained("facebook/rag-token-base")
generator = RagTokenForGeneration.from_pretrained("facebook/rag-token-base")


def generate_answer(tokenizer, retriever, generator, query, documents):
    """Generate an answer to `query` with a RAG token model.

    Args:
        tokenizer: RagTokenizer used to encode the query/documents.
        retriever: RagRetriever producing document relevance scores.
        generator: RagTokenForGeneration model.
        query: the user question (str).
        documents: context passed to the tokenizer as the text pair.

    Returns:
        List of decoded answer strings (one per batch item).
    """
    inputs = tokenizer(query, documents, return_tensors="pt",
                       padding="max_length", max_length=256, truncation=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    doc_scores = retriever(input_ids, attention_mask)
    # NOTE(review): RagTokenizer exposes no `context_id` attribute -- this line
    # raises AttributeError at runtime; the intended special-token id must come
    # from the underlying question-encoder/generator tokenizer. TODO confirm.
    context_input_ids = input_ids.new_full((input_ids.shape[0], 1),
                                           tokenizer.context_id, dtype=torch.long)
    context_attention_mask = input_ids.new_full(context_input_ids.shape, 1)
    generator_input_ids = torch.cat([context_input_ids, input_ids], dim=1)
    generator_attention_mask = torch.cat([context_attention_mask, attention_mask], dim=1)
    outputs = generator.generate(generator_input_ids,
                                 attention_mask=generator_attention_mask,
                                 doc_scores=doc_scores)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
import os
import timeit

# The old `haystack.indexing.*` paths were removed (see the ModuleNotFoundError
# in this cell's output); the equivalents live under `haystack.preprocessor`.
# Dropped `clean_whitespace, clean_html, clean_preprocessor`: they were unused
# and do not appear to exist in `haystack.preprocessor.cleaning` -- verify.
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.preprocessor import PreProcessor
from haystack.document_store import InMemoryDocumentStore, WeaviateDocumentStore
from haystack.retriever.dense import EmbeddingRetriever
from haystack.utils import print_answers

# NOTE(review): `cfg` is referenced throughout but never imported anywhere in
# this notebook, so this cell raises NameError on a fresh kernel. It must
# provide DATA_PATH, WEAVIATE_HOST/PORT/EMBEDDING_DIM, PRE_PROCESSOR_* and
# EMBEDDINGS -- import the project's config module here. TODO confirm name.


def run_ingest():
    """Ingest 'Aditya_train.txt' into a Weaviate document store.

    Converts the file to haystack dicts, preprocesses them into overlapping
    word chunks, writes them to Weaviate, and updates dense embeddings via an
    EmbeddingRetriever. Prints elapsed time, or a message if the file is absent.
    """
    data_file = "Aditya_train.txt"
    data_path = os.path.join(cfg.DATA_PATH, data_file)

    if not os.path.isfile(data_path):
        print(f"File {data_file} not found in the specified DATA_PATH.")
        return

    start = timeit.default_timer()

    vector_store = WeaviateDocumentStore(host=cfg.WEAVIATE_HOST,
                                         port=cfg.WEAVIATE_PORT,
                                         embedding_dim=cfg.WEAVIATE_EMBEDDING_DIM)

    # BUG FIX: convert_files_to_dicts expects a *directory* in dir_path; the
    # original passed the file path itself. Convert the directory, then keep
    # only the target file.
    raw_docs = convert_files_to_dicts(dir_path=cfg.DATA_PATH,
                                      clean_func=clean_wiki_text,
                                      split_paragraphs=True)
    raw_docs = [doc for doc in raw_docs if doc.get('name') == data_file]

    # Re-shape into the {'content', 'meta'} schema the preprocessor expects.
    final_doc = [{'content': doc['text'], 'meta': {'name': doc['name']}}
                 for doc in raw_docs]

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=False,
        clean_header_footer=False,
        split_by="word",
        language="en",
        split_length=cfg.PRE_PROCESSOR_SPLIT_LENGTH,
        split_overlap=cfg.PRE_PROCESSOR_SPLIT_OVERLAP,
        split_respect_sentence_boundary=True,
    )

    preprocessed_docs = preprocessor.process(final_doc)
    vector_store.write_documents(preprocessed_docs)

    retriever = EmbeddingRetriever(
        document_store=vector_store,
        embedding_model=cfg.EMBEDDINGS
    )
    vector_store.update_embeddings(retriever)

    end = timeit.default_timer()
    print(f"Time to prepare embeddings: {end - start}")