import os
import textwrap

import streamlit as st
import torch
from auto_gptq import AutoGPTQForCausalLM
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, TextStreamer, pipeline

# Quick sanity-check widget to confirm the Streamlit app is rendering.
x = st.slider('Select a value')
st.write(x, 'squared is', x * x)

# Show the working directory and its contents so the PDF path can be verified.
current_working_directory = os.getcwd()
print(current_working_directory)
st.write('current dir:', current_working_directory)
arr = os.listdir('.')
st.write('dir contents:', arr)


def print_response(response: str):
    """Wrap a long response to 100 characters per line for console output."""
    print("\n".join(textwrap.wrap(response, width=100)))


# Load the PDF and split it into pages.
pdf_loader = UnstructuredPDFLoader("./pdfs/Predicting issue types on GitHub.pdf")
pdf_pages = pdf_loader.load_and_split()
st.write('total pages from PDFs:', len(pdf_pages))

# Split the pages into overlapping chunks for retrieval.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=512)
texts = text_splitter.split_documents(pdf_pages)
st.write('total chunks from pages:', len(texts))

# Embed the chunks and load them into a Chroma vector store.
st.write('loading chunks into vector db')
model_name = "hkunlp/instructor-large"
hf_embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
db = Chroma.from_documents(texts, hf_embeddings)

# Load the tokenizer for the quantized Llama-2 chat model.
st.write('loading tokenizer')
model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"  # GPTQ weights, as required by AutoGPTQForCausalLM
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model_basename = "model"
use_triton = False
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the GPTQ-quantized model.
st.write('loading LLM')
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    device=DEVICE,
    use_triton=use_triton,
    quantize_config=None,
)

# Wrap the model in a text-generation pipeline that LangChain can drive.
st.write('setting up the chain')
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
text_pipeline = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    streamer=streamer,
    max_new_tokens=256,  # the pipeline's default generation length would truncate answers
)
llm = HuggingFacePipeline(pipeline=text_pipeline)


def generate_prompt(prompt, sys_prompt):
    """Wrap a user prompt and system prompt in the Llama-2 chat format."""
    return f"[INST] <<SYS>>\n{sys_prompt}\n<</SYS>>\n\n{prompt} [/INST]"


sys_prompt = "Use the following piece of context to answer the question in less than 20 words."
template = generate_prompt(
    """
{context}

Question: {question}
""",
    sys_prompt,
)
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": prompt}

# Build a RetrievalQA chain that stuffs the top-2 retrieved chunks into the prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

st.write('READY!!!')

# Run a few sample questions through the chain and show the results.
q1 = "What did the author work on?"
q2 = "Where did the author study?"
q3 = "What did the author do?"
result = qa_chain(q1)
st.write('question:', q1, 'result:', result)
result = qa_chain(q2)
st.write('question:', q2, 'result:', result)
result = qa_chain(q3)
st.write('question:', q3, 'result:', result)
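
# Usage sketch (assumptions: this script is saved as app.py, the PDF lives under
# ./pdfs/, and streamlit, langchain, chromadb, transformers, auto-gptq,
# InstructorEmbedding, sentence-transformers and unstructured are installed):
#
#   streamlit run app.py
#
# The app loads the PDF, builds the vector store, and answers the three sample
# questions each time the page reruns.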