# -*- coding: utf-8 -*-
"""TinyLlama 1.1B LLM RAG Research Chatbot.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1gKNj3wQw1pUbUXLJ4TcQCW16ezvL8pPo
"""

# Install dependencies.
!pip install pypdf
!pip install python-dotenv
!pip install -q transformers einops accelerate langchain bitsandbytes
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
!pip install -q llama-index
!pip install sentence_transformers
!pip install llama-index-llms-huggingface
!pip install llama-index-embeddings-langchain
!pip install -q gradio

import logging
import sys

# Log llama-index activity (loading, indexing, querying) to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, ServiceContext

# Load every document (e.g. research PDFs) placed in /content/Data/.
documents = SimpleDirectoryReader("/content/Data/").load_data()

len(documents)  # number of documents loaded

documents[10]   # inspect one parsed document

from llama_index.core import PromptTemplate

system_prompt = (
    "You are a Q&A assistant. Your goal is to answer questions as accurately "
    "as possible based on the instructions and context provided."
)

# This will wrap the default prompts that are internal to llama-index,
# using TinyLlama-Chat's (Zephyr-style) chat markers.
query_wrapper_prompt = PromptTemplate("<|user|>\n{query_str}</s>\n<|assistant|>\n")

from llama_index.llms.huggingface import HuggingFaceLLM
import torch

llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    # Greedy decoding; temperature is ignored when do_sample=False.
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device_map="cuda",
    # Load the weights in bfloat16 to reduce GPU memory usage.
    model_kwargs={"torch_dtype": torch.bfloat16},
)

from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

# Use a small BGE sentence-embedding model for retrieval.
lc_embed_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5"
)
embed_model = LangchainEmbedding(lc_embed_model)

service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model,
)

# Build the vector index over the loaded documents and expose a query engine.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

query_engine = index.as_query_engine()

def predict(message, history):
    """Gradio chat callback: answer each message with the RAG query engine."""
    response = query_engine.query(message)
    return str(response)

import gradio as gr

gr.ChatInterface(predict).launch(share=True)
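# ---------------------------------------------------------------------------
# Note: ServiceContext is deprecated in llama-index 0.10.x and removed in
# 0.11+. If the ServiceContext import above fails, a roughly equivalent setup
# (a minimal sketch, assuming llama-index >= 0.10) uses the global Settings
# object instead:
#
#     from llama_index.core import Settings
#
#     Settings.llm = llm
#     Settings.embed_model = embed_model
#     Settings.chunk_size = 1024
#     index = VectorStoreIndex.from_documents(documents)
#
# Optional sanity check of the query engine outside the Gradio UI. The
# question below is a placeholder; replace it with one that matches your own
# PDFs. Printing the source nodes shows which retrieved chunks grounded the
# answer.
response = query_engine.query("What is the main contribution of the paper?")
print(response)
for source in response.source_nodes:
    print(source.node.metadata.get("file_name"), "- score:", source.score)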