|
from pathlib import Path
|
|
|
|
|
|
from langchain_community.document_loaders import (
|
|
CSVLoader,
|
|
PDFMinerLoader,
|
|
PyPDFLoader,
|
|
TextLoader,
|
|
UnstructuredHTMLLoader,
|
|
UnstructuredMarkdownLoader,
|
|
UnstructuredPowerPointLoader,
|
|
UnstructuredWordDocumentLoader,
|
|
WebBaseLoader,
|
|
YoutubeLoader,
|
|
DirectoryLoader,
|
|
)
|
|
|
|
|
|
|
|
# Registry mapping a document source to the loader class that reads it.
# Keys that start with a dot are file extensions; 'web', 'directory' and
# 'youtube' are non-file source kinds. Values are classes, not instances.
# Note: '.pdf' is served by PDFMinerLoader; PyPDFLoader is imported by this
# module as well but is not registered here.
LOADER_CLASSES = {
    '.csv': CSVLoader,
    '.doc': UnstructuredWordDocumentLoader,
    '.docx': UnstructuredWordDocumentLoader,
    '.html': UnstructuredHTMLLoader,
    '.md': UnstructuredMarkdownLoader,
    '.pdf': PDFMinerLoader,
    '.ppt': UnstructuredPowerPointLoader,
    '.pptx': UnstructuredPowerPointLoader,
    '.txt': TextLoader,
    'web': WebBaseLoader,
    'directory': DirectoryLoader,
    'youtube': YoutubeLoader,
}
|
|
|
|
|
|
# Subtitle language codes, in the listed order: Russian first, then English.
# NOTE(review): presumably the transcript languages requested when loading
# YouTube videos - confirm at the call site.
SUBTITLES_LANGUAGES = [
    'ru',
    'en',
]
|
|
|
|
|
|
# Prompt template (Russian text) with two str.format placeholders:
# {context} and {user_message}. In English it reads: "Answer the question
# given the context. / Context: ... / Question: ... / Answer:".
# The string deliberately ends right after 'Ответ:' with no trailing newline.
CONTEXT_TEMPLATE = '''Ответь на вопрос при условии контекста.

Контекст:
{context}

Вопрос:
{user_message}

Ответ:'''
|
|
|
|
|
|
# Default sampling settings, kept as a plain dict of keyword arguments.
# NOTE(review): presumably unpacked into the LLM's text-generation call -
# confirm against the caller.
GENERATE_KWARGS = dict(
    temperature=0.2,
    top_p=0.95,
    top_k=40,
    repeat_penalty=1.0,
)
|
|
|
|
|
|
# Keyword arguments for constructing the LLM, kept as a plain dict.
# NOTE(review): the names match llama-cpp-python's `Llama(...)` parameters,
# where n_gpu_layers=-1 would mean "offload all layers" and n_ctx is the
# context window in tokens - confirm against the code that consumes this.
LLAMA_MODEL_KWARGS = dict(
    n_gpu_layers=-1,
    verbose=False,
    n_ctx=4096,
)
|
|
|
|
|
|
# Directories for LLM and embedding model files. Both paths are relative,
# so they resolve against the current working directory of the process.
LLM_MODELS_PATH = Path('models')
EMBED_MODELS_PATH = Path('embed_models')

# Import-time side effect: make sure both directories exist.
# exist_ok=True turns the call into a no-op when the directory is already there.
LLM_MODELS_PATH.mkdir(exist_ok=True)
EMBED_MODELS_PATH.mkdir(exist_ok=True)
|
|
|
|
|
|
# Repository identifiers ('owner/name') of the GGUF LLM builds on offer.
# NOTE(review): these look like Hugging Face Hub repo ids, and the order is
# presumably the order shown to the user - confirm against the consumer.
LLM_MODEL_REPOS = [
    'bartowski/google_gemma-3-4b-it-GGUF',
    'bartowski/Qwen2.5-3B-Instruct-GGUF',
    'bartowski/gemma-2-2b-it-GGUF',
    'bartowski/Qwen2.5-1.5B-Instruct-GGUF',
    'bartowski/openchat-3.6-8b-20240522-GGUF',
    'bartowski/Mistral-7B-Instruct-v0.3-GGUF',
    'bartowski/Llama-3.2-3B-Instruct-GGUF',
]
|
|
|
|
|
|
# File name of the GGUF model selected at start-up.
# NOTE(review): presumably a file inside the first LLM_MODEL_REPOS entry
# ('bartowski/google_gemma-3-4b-it-GGUF') - confirm the file exists there.
START_LLM_MODEL_FILE = 'google_gemma-3-4b-it-Q4_K_M.gguf'
|
|
|
|
|
|
# Repository identifiers ('owner/name') of the embedding models on offer.
# NOTE(review): these look like Hugging Face Hub repo ids, and the first
# entry is presumably the default choice - confirm against the consumer.
EMBED_MODEL_REPOS = [
    'intfloat/multilingual-e5-small',
    'sergeyzh/rubert-tiny-turbo',
    'sergeyzh/BERTA',
    'cointegrated/rubert-tiny2',
    'cointegrated/LaBSE-en-ru',
    'sergeyzh/LaBSE-ru-turbo',
    'intfloat/multilingual-e5-large',
    'intfloat/multilingual-e5-base',
    'intfloat/multilingual-e5-large-instruct',
    'sentence-transformers/all-mpnet-base-v2',
    'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
    'ai-forever/ruElectra-medium',
    'ai-forever/sbert_large_nlu_ru',
]
|
|
|