# Page-extraction header (not part of the module): Spaces status "Paused"; file size 1,297 bytes; revision c97d8e1.
from typing import List, Dict, Tuple
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
def create_prompt(prompt):
    """Build a chat prompt template from a template string.

    Args:
        prompt: Template text passed to ``ChatPromptTemplate.from_template``.

    Returns:
        The object produced by ``ChatPromptTemplate.from_template(prompt)``.
    """
    return ChatPromptTemplate.from_template(prompt)
def split_documents(
    documents: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 100,
) -> Tuple[List[Document], int]:
    """Split documents into overlapping chunks and report their total size.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        A ``(chunks, total_chars)`` tuple: the chunked documents and the sum
        of ``len(page_content)`` over all chunks. The second value is a
        character count used as a rough stand-in for a token count; it is
        not produced by a tokenizer.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    split_docs = text_splitter.split_documents(documents)
    # Characters, not tokens: overlapping regions are counted once per chunk.
    total_chars = sum(len(doc.page_content) for doc in split_docs)
    return split_docs, total_chars
def generate_embeddings(
    docs: List[Document],
    model: str = "text-embedding-3-small",
) -> Tuple[List[List[float]], int]:
    """Embed each document's text with an OpenAI embedding model.

    Args:
        docs: Documents whose ``page_content`` will be embedded.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A ``(embeddings, total_chars)`` tuple: one embedding vector per
        document, and the sum of ``len(page_content)`` over all documents.
        The second value is a character count used as a rough stand-in for
        a token count; it is not produced by a tokenizer.
    """
    # Nothing to embed: skip building a client and calling the API.
    if not docs:
        return [], 0

    embeddings_model = OpenAIEmbeddings(model=model)
    texts = [doc.page_content for doc in docs]
    embeddings = embeddings_model.embed_documents(texts)
    total_chars = sum(len(text) for text in texts)
    return embeddings, total_chars
def create_qamodel(model="gpt-4o-mini", temperature=0):
    """Create the chat model used for question answering.

    Args:
        model: Name of the OpenAI chat model.
        temperature: Sampling temperature passed to the model.

    Returns:
        A ``ChatOpenAI`` instance configured with the given arguments.
    """
    # Forward the caller's arguments; they were previously ignored in
    # favour of hard-coded literals equal to the defaults.
    return ChatOpenAI(model=model, temperature=temperature)