# -*- coding: utf-8 -*-
"""Save MPI Mining Data as Text Backup

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1bpGk38t5or9A4MCj8QVPLdTRH0D9qIN-
"""
!git clone https://github.com/ricklon/mpi_data.git | |
# todo: pages store the enture colletion to embedding as well. | |
!pip install -q langchain pypdf langchain_community sentence_transformers faiss-cpu pandas tqdm | |
import datetime
import os
import time
from io import StringIO

import pandas as pd
from tqdm import tqdm

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
# Sentence-embedding model used to vectorize documents (small GTE model, runs on CPU).
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# Hosted LLM queried through the HuggingFace Inference API.
HG_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
# hg_model = "HuggingFaceH4/zephyr-7b-beta"
# hg_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  ## Tiny LLAMA not supported

# Near-deterministic decoding (temperature ~0) with a mild repetition penalty,
# capped at 250 generated tokens per call.
llm = HuggingFaceEndpoint(
    repo_id=HG_MODEL,
    max_new_tokens=250,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.035,
)
# Label for the data collection the vector store is built from.
VECTOR_SOURCE = "miningdata"

# CPU-based embedding backend; normalized outputs make inner product act as
# cosine similarity in the FAISS index.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
)