# Spaces: Sleeping  (page-header residue from the web scrape; not part of the script)
import logging
from pathlib import Path

from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.testset.generator import TestsetGenerator

from preprocessing import preprocess_documents
# Directory the raw source documents are read from.
DIRECTORY_RAW_DATA: str = "./data/self_scraped/sample_epub"
# Vector store location; not referenced in the visible part of this script.
DIRECTORY_VECTORSTORE: str = "./storage/first_vectorstore"
# CSV file the generated test set is written to.
TESTSET_PATH: str = "data/self_scraped/test/sample_epub_20q.csv"
# Disabled alternative output path (presumably for a Hugging Face dataset
# export -- confirm before re-enabling).
# TESTSET_PATH_HF: str = "data/self_scraped/test/sample_epub_hf"
# Build the ragas test set generator on top of OpenAI models.
# Two separate LLMs are handed to ragas: one as the generator and one as the
# critic (see the ragas TestsetGenerator docs for how each role is used).
generator_llm = OpenAI(model="gpt-3.5-turbo-16k")
critic_llm = OpenAI(model="gpt-4o-mini")
# No model is specified, so the library's default embedding model is used.
embeddings = OpenAIEmbedding()

generator = TestsetGenerator.from_llama_index(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)
# Load every file found under DIRECTORY_RAW_DATA as llama-index documents,
# then run them through the project's preprocessing step before generation.
documents = SimpleDirectoryReader(DIRECTORY_RAW_DATA).load_data(show_progress=True)
preped_documents = preprocess_documents(documents)
# generate testset
# The distribution weights sum to 1.0: half simple questions, a quarter
# reasoning questions and a quarter multi-context questions.
# NOTE(review): TESTSET_PATH ends in "_20q" but only 5 samples are requested
# here -- confirm which of the two is intended.
testset = generator.generate_with_llamaindex_docs(
    preped_documents,
    test_size=5,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target directory exists before writing.
Path(TESTSET_PATH).parent.mkdir(parents=True, exist_ok=True)
df = testset.to_pandas()
df.to_csv(TESTSET_PATH)

# Without configuration the root logger only emits WARNING and above, which
# would silently drop the confirmation below. Configured here (after the
# generation step) so third-party INFO output during generation is unchanged.
logging.basicConfig(level=logging.INFO)
logging.info("Testset has been saved under %s", TESTSET_PATH)