# mpi_data_store / save_mpi_mining_data_as_text_backup.py
# (Hugging Face file-page residue preserved as comments: author rianders,
#  commit message "fixed dependency generation", commit 5c8cbfc,
#  page chrome: raw / history / blame / contribute / delete, "No virus", 1.57 kB)
# -*- coding: utf-8 -*-
"""Save MPI Mining Data as Text Backup

Notebook-exported script: clones the MPI mining-data repository, then sets up
an LLM and an embedding model (see below) for backing up the collection.

Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1bpGk38t5or9A4MCj8QVPLdTRH0D9qIN-
"""
# NOTE: the `!` lines below are IPython/Colab shell magics; they only run
# inside a notebook, not under the plain Python interpreter.
!git clone https://github.com/ricklon/mpi_data.git
# TODO: also store the entire collection of pages as embeddings.
!pip install -q langchain pypdf langchain_community sentence_transformers faiss-cpu pandas tqdm
# Standard library
import datetime
import os
import time
from io import StringIO

# Third-party
import pandas as pd
from tqdm import tqdm

# LangChain. Load PyPDFDirectoryLoader from langchain_community: the old
# `langchain.document_loaders` path is deprecated, and every other LangChain
# import in this file already uses the langchain_community / langchain_core
# packages.
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
# Model identifiers used by the pipeline.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"
HG_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
# Alternatives previously tried:
#   "HuggingFaceH4/zephyr-7b-beta"
#   "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  -- Tiny LLAMA not supported

# Near-deterministic sampling settings (very low temperature) for the hosted
# Hugging Face inference endpoint.
_GENERATION_KWARGS = {
    "max_new_tokens": 250,
    "top_k": 10,
    "top_p": 0.95,
    "typical_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.035,
}

llm = HuggingFaceEndpoint(repo_id=HG_MODEL, **_GENERATION_KWARGS)
# Name of the data source — presumably the folder/collection the vector
# store is built from; confirm against the cells that use it.
VECTOR_SOURCE = "miningdata"

# CPU-hosted sentence embeddings. Embeddings are normalized so that
# inner-product search behaves as cosine similarity.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)