File size: 1,565 Bytes
5c8cbfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*- coding: utf-8 -*-
"""Save  MPI Mining Data as Text Backup

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1bpGk38t5or9A4MCj8QVPLdTRH0D9qIN-
"""

# NOTE(review): the "!" lines below are IPython/Colab shell magics, not valid
# plain Python — this Colab-exported file only runs inside a notebook kernel.
# Clone the repository that holds the MPI mining-data PDFs.
!git clone https://github.com/ricklon/mpi_data.git
# TODO: pages — store the entire collection as embeddings as well.

# Install the RAG stack: LangChain, PDF loading, sentence embeddings, FAISS.
!pip install -q langchain pypdf langchain_community sentence_transformers faiss-cpu pandas tqdm

import os
import  pandas as pd
from io import StringIO
from tqdm import tqdm
import time
import datetime

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint

from langchain.document_loaders import PyPDFDirectoryLoader

# Model identifiers used throughout the pipeline.
# Embedding backbone: a small GTE sentence-embedding model (CPU-friendly).
EMBEDDING_MODEL_NAME = "thenlper/gte-small"
# Generation model served via the Hugging Face inference endpoint.
HG_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
# Candidates tried previously and kept for reference:
# hg_model = "HuggingFaceH4/zephyr-7b-beta"
#hg_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" ## Tiny LLAMA not supported

# Remote LLM handle: Mistral-7B-Instruct served by the Hugging Face
# inference endpoint. Decoding is near-greedy (temperature 0.01) with a
# mild repetition penalty; responses are capped at 250 new tokens.
llm = HuggingFaceEndpoint(
    repo_id=HG_MODEL,
    max_new_tokens=250,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.035,
)

# Label for the document collection the vector store is built from.
VECTOR_SOURCE = "miningdata"

# Configuration for the sentence-transformer embedding backend.
# Normalized embeddings make inner product equal cosine similarity,
# which is what the downstream FAISS index relies on.
_model_opts = {"device": "cpu"}
_encode_opts = {"normalize_embeddings": True}  # set True for cosine similarity

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,  # parallelize encoding across worker processes
    model_kwargs=_model_opts,
    encode_kwargs=_encode_opts,
)