Spaces:
Paused
Paused
from typing import List, Dict, Tuple | |
from langchain_core.prompts import ChatPromptTemplate | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.schema import Document | |
from langchain_openai import OpenAIEmbeddings, ChatOpenAI | |
def create_prompt(prompt):
    """Wrap a prompt string in a ``ChatPromptTemplate``.

    Args:
        prompt: Template text handed to ``ChatPromptTemplate.from_template``.

    Returns:
        The template object produced by ``ChatPromptTemplate.from_template``.
    """
    return ChatPromptTemplate.from_template(prompt)
def split_documents(
    documents: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 100,
) -> Tuple[List[Document], int]:
    """Split documents into overlapping chunks and report their total size.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Length shared between consecutive chunks, measured
            with ``len`` (characters).

    Returns:
        A ``(split_docs, total_tokens)`` tuple. ``total_tokens`` is the summed
        character length of all chunks, used only as a rough stand-in for a
        token count; no tokenizer is involved.
    """
    # Nothing to split: same result as running the splitter on an empty list.
    if not documents:
        return [], 0

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    split_docs = text_splitter.split_documents(documents)
    # Character count, not a real token count (approximation only).
    total_tokens = sum(len(doc.page_content) for doc in split_docs)
    return split_docs, total_tokens
def generate_embeddings(
    docs: List[Document],
    model: str = "text-embedding-3-small",
) -> Tuple[List[List[float]], int]:
    """Embed the text of each document with an OpenAI embedding model.

    Args:
        docs: Documents whose ``page_content`` is embedded.
        model: Name of the embedding model passed to ``OpenAIEmbeddings``.

    Returns:
        An ``(embeddings, total_tokens)`` tuple. ``embeddings`` is whatever
        ``embed_documents`` returns for the document texts. ``total_tokens``
        is the summed character length of the inputs, used only as a rough
        stand-in for a token count.
    """
    # Skip building a client (which needs credentials) when there is no work.
    if not docs:
        return [], 0

    texts = [doc.page_content for doc in docs]
    embeddings_model = OpenAIEmbeddings(model=model)
    embeddings = embeddings_model.embed_documents(texts)
    # Character count, not a real token count (approximation only).
    total_tokens = sum(len(text) for text in texts)
    return embeddings, total_tokens
def create_qamodel(model="gpt-4o-mini", temperature=0):
    """Create the ``ChatOpenAI`` model used for question answering.

    Args:
        model: Name of the chat model to use.
        temperature: Sampling temperature passed to the model.

    Returns:
        A ``ChatOpenAI`` instance configured with the given arguments.
    """
    # Forward the arguments; previously they were ignored in favour of
    # hard-coded values, so callers could not change the model or temperature.
    qamodel = ChatOpenAI(
        model=model,
        temperature=temperature,
    )
    return qamodel