# NOTE: web-scrape artifacts (Space status lines, file size, commit hash, and
# gutter line numbers) were removed from the top of this notebook export.
# -*- coding: utf-8 -*-
"""Save MPI Mining Data as Text Backup
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1bpGk38t5or9A4MCj8QVPLdTRH0D9qIN-
"""
# Fetch the MPI mining-data repository (PDF sources) into the Colab workspace.
!git clone https://github.com/ricklon/mpi_data.git
# TODO: pages — store the entire collection as embeddings as well.
# Install the RAG stack: LangChain, PDF loading, embeddings, and FAISS (CPU).
!pip install -q langchain pypdf langchain_community sentence_transformers faiss-cpu pandas tqdm
import os
import pandas as pd
from io import StringIO
from tqdm import tqdm
import time
import datetime
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain.document_loaders import PyPDFDirectoryLoader
# Sentence-embedding model used to vectorize documents for the FAISS index.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"
# Instruction-tuned LLM served remotely via the Hugging Face Inference Endpoint.
HG_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
# Previously tried alternatives, kept for reference:
# hg_model = "HuggingFaceH4/zephyr-7b-beta"
#hg_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" ## Tiny LLAMA not supported
# Generation settings for the remote endpoint: near-greedy decoding
# (temperature 0.01) with nucleus/typical sampling caps and a mild
# repetition penalty.
_LLM_KWARGS = {
    "max_new_tokens": 250,
    "top_k": 10,
    "top_p": 0.95,
    "typical_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.035,
}

# Remote Mistral-7B-Instruct endpoint used as the answering LLM.
llm = HuggingFaceEndpoint(repo_id=HG_MODEL, **_LLM_KWARGS)
# Directory holding the source documents that will be embedded and indexed.
VECTOR_SOURCE = "miningdata"

# CPU-backed sentence-transformer embeddings. Vectors are L2-normalized so
# that FAISS inner-product search behaves as cosine similarity.
# NOTE: the original export ended this call with a stray " |" scrape
# artifact, which was a syntax error — removed here.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},  # True => cosine similarity
)