hkunlp/instructor-xl · TypeError: _load_sbert_model() got an unexpected keyword argument 'token'

May 9, 2024

•

edited May 9, 2024

Attempts:
import streamlit as st # type: ignore
from PyPDF2 import PdfReader # type: ignore
from langchain.text_splitter import RecursiveCharacterTextSplitter # type: ignore
from langchain.embeddings import HuggingFaceInstructEmbeddings # type: ignore
from langchain.vectorstores import FAISS # type: ignore
from dotenv import load_dotenv # type: ignore
from InstructorEmbedding import INSTRUCTOR # type: ignore
from sentence_transformers import SentenceTransformer # Use SentenceTransformer module to use Hugging face Model
#import torch

with st.sidebar:
st.title('LLM Chat App')
st.markdown('''
## About
This app is an LLM-powered chatbot built using:
- Streamlit
- Langchain
- HuggingFace
''')

def main():
load_dotenv()
st.header("Chat with PDF 📄💬")

  # upload PDF file
  pdf = st.file_uploader("Upload your PDF", type='pdf')

  #st.write(pdf) # this code displays file name, indicating if any file is uploaded

  #check if file is uploaded before reading (prevent error on first run, when no doc is uploaded)
  if pdf is not None:
    pdf_reader = PdfReader(pdf)
    #st.write(pdf_reader) 

    text = ""
    for page in pdf_reader.pages:
         # extract text from every page
         text += page.extract_text()

    #st.write(text) #check if text are sucessfully read

    text_splitter = RecursiveCharacterTextSplitter(
         chunk_size = 1000,
         chunk_overlap = 200,
         length_function = len
    )
    chunks = text_splitter.split_text(text=text)

    #st.write(chunks)


    # attempt 1: embeddings (failed)
    #model = INSTRUCTOR('hkunlp/instructor-xl')
    #embeddings = model.encode([[chunk] for chunk in chunks]) 
    
    # attempt 2: embeddings (failed)
    #model = SentenceTransformer('hkunlp/instructor-xl') #loaded for a very long time
    #model = "hkunlp/instructor-xl"
    #embeddings = HuggingFaceInstructEmbeddings(model_name=model)
    

    # attempt 3: embeddings
    #embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
   
    #vectorstore = FAISS.from_text(chunks, embeddings=embeddings)

     # attempt 4 (failed)
      # Initialize the INSTRUCTOR model
    model = INSTRUCTOR('hkunlp/instructor-xl')

    # Encode chunks
    embeddings = []
    for chunk in chunks:
        instruction = "Embed PDF text"
        embeddings.append(model.encode([instruction, chunk]))

    # Initialize vector store
    vectorstore = FAISS.from_text(chunks, embeddings=embeddings)

if name == 'main':

    main()

requirements.txt :
langchain==0.0.154
pyPDF2==3.0.1
python-dotenv==1.0.0
streamlit==1.18.1
faiss-cpu==1.7.4
streamlit-extras
altair==4.1.0
huggingface-hub==0.14.1
InstructorEmbedding==1.0.1
sentence-transformers==2.2.2

using python 3.9