# -*- coding: utf-8 -*-
"""Save MPI Mining Data as Text Backup.

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1bpGk38t5or9A4MCj8QVPLdTRH0D9qIN-

This section configures the text-generation endpoint (``llm``) and the
sentence-embedding model (``embedding_model``) used by the rest of the script.
"""

# Notebook-only setup. The leading "!" is an IPython shell escape and is a
# SyntaxError in a plain .py module, so the commands are kept as comments.
# Run them in a shell (without the "!") or in a notebook cell before executing
# this script:
#
# !git clone https://github.com/ricklon/mpi_data.git
# !pip install -q langchain pypdf langchain_community sentence_transformers faiss-cpu pandas tqdm

# TODO: pages store the entire collection to embedding as well.

import datetime
import os
import time
from io import StringIO

import pandas as pd
from tqdm import tqdm
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS

# Sentence-embedding model used to vectorize documents.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# Hugging Face repo id of the text-generation model served by the endpoint.
HG_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
# Alternatives that were considered:
#   "HuggingFaceH4/zephyr-7b-beta"
#   "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  (Tiny LLAMA not supported)

# Text-generation client for HG_MODEL.
# NOTE(review): presumably requires a Hugging Face API token in the
# environment -- confirm before running outside the original notebook.
llm = HuggingFaceEndpoint(
    repo_id=HG_MODEL,
    max_new_tokens=250,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,  # near-zero sampling temperature
    repetition_penalty=1.035,
)

VECTOR_SOURCE = "miningdata"

# Embedding model for EMBEDDING_MODEL_NAME, configured to run on CPU.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
)