# mpi_data_store / save_mpi_mining_data_as_text_backup.py
# (Hugging Face file-page residue preserved as comments: author rianders,
#  commit message "fixed dependency generation", commit 5c8cbfc,
#  page chrome: raw / history / blame / contribute / delete, "No virus", 1.57 kB)
# -*- coding: utf-8 -*-
"""Save MPI Mining Data as Text Backup

Notebook-exported script: clones the MPI mining-data repository, then sets up
an LLM and an embedding model (see below) for backing up the collection.

Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1bpGk38t5or9A4MCj8QVPLdTRH0D9qIN-
"""
# NOTE: the `!` lines below are IPython/Colab shell magics; they only run
# inside a notebook, not under the plain Python interpreter.
!git clone https://github.com/ricklon/mpi_data.git
# TODO: also store the entire collection of pages as embeddings.
!pip install -q langchain pypdf langchain_community sentence_transformers faiss-cpu pandas tqdm
# Standard library
import datetime
import os
import time
from io import StringIO

# Third-party
import pandas as pd
from tqdm import tqdm

# LangChain. Load PyPDFDirectoryLoader from langchain_community: the old
# `langchain.document_loaders` path is deprecated, and every other LangChain
# import in this file already uses the langchain_community / langchain_core
# packages.
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
# Model identifiers used by the pipeline.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"
HG_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
# Alternatives previously tried:
#   "HuggingFaceH4/zephyr-7b-beta"
#   "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  -- Tiny LLAMA not supported

# Near-deterministic sampling settings (very low temperature) for the hosted
# Hugging Face inference endpoint.
_GENERATION_KWARGS = {
    "max_new_tokens": 250,
    "top_k": 10,
    "top_p": 0.95,
    "typical_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.035,
}

llm = HuggingFaceEndpoint(repo_id=HG_MODEL, **_GENERATION_KWARGS)
# Name of the data source — presumably the folder/collection the vector
# store is built from; confirm against the cells that use it.
VECTOR_SOURCE = "miningdata"

# CPU-hosted sentence embeddings. Embeddings are normalized so that
# inner-product search behaves as cosine similarity.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)