"""Generate a synthetic question test set with ragas and save it as CSV.

The script loads the documents found under ``DIRECTORY_RAW_DATA``, passes
them through ``preprocess_documents``, asks a ragas ``TestsetGenerator``
(backed by OpenAI models) to produce test questions, and writes the result
to ``TESTSET_PATH``.

All work happens at module level, so importing this module runs the whole
pipeline (including the OpenAI calls).
"""

import logging
from pathlib import Path

from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from ragas.testset.evolutions import multi_context, reasoning, simple
from ragas.testset.generator import TestsetGenerator

from preprocessing import preprocess_documents

# Without explicit configuration the root logger only emits WARNING and above,
# so the final "saved" message would be silently dropped. basicConfig is a
# no-op if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)

DIRECTORY_RAW_DATA: str = "./data/self_scraped/sample_epub"
# NOTE(review): not referenced anywhere in this script - presumably kept for
# other modules or a later step; confirm before removing.
DIRECTORY_VECTORSTORE: str = "./storage/first_vectorstore"
# NOTE(review): the file name says "20q" but test_size below is 5 - confirm
# which one is intended.
TESTSET_PATH: str = "data/self_scraped/test/sample_epub_20q.csv"
# TESTSET_PATH_HF: str = "data/self_scraped/test/sample_epub_hf"

# Create the output directory up front: a missing directory would otherwise
# only surface in to_csv(), after the LLM generation has already been run.
Path(TESTSET_PATH).parent.mkdir(parents=True, exist_ok=True)

# generator with openai models
generator_llm = OpenAI(model="gpt-3.5-turbo-16k")
critic_llm = OpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbedding()

generator = TestsetGenerator.from_llama_index(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

documents = SimpleDirectoryReader(DIRECTORY_RAW_DATA).load_data(show_progress=True)
preped_documents = preprocess_documents(documents)

# generate testset
testset = generator.generate_with_llamaindex_docs(
    preped_documents,
    test_size=5,
    # Share of each ragas evolution type in the generated questions.
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

df = testset.to_pandas()
df.to_csv(TESTSET_PATH)
logging.info("Testset has been saved under %s", TESTSET_PATH)