File size: 1,319 Bytes
3b4f6eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import logging
from pathlib import Path

from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.testset.generator import TestsetGenerator

from preprocessing import preprocess_documents

# Directory the raw source documents are read from (passed to SimpleDirectoryReader below).
DIRECTORY_RAW_DATA: str = "./data/self_scraped/sample_epub"
# Presumably the persistence directory of a vector store; it is not referenced
# anywhere in the visible part of this script.
DIRECTORY_VECTORSTORE: str = "./storage/first_vectorstore"
# CSV file the generated test set is written to at the end of the script.
# NOTE(review): the file name says "20q", but the generation call below uses
# test_size=5 — confirm which of the two is intended.
TESTSET_PATH: str = "data/self_scraped/test/sample_epub_20q.csv"
# TESTSET_PATH_HF: str = "data/self_scraped/test/sample_epub_hf"

# OpenAI-backed building blocks for the ragas test-set generator: an embedding
# model plus two chat models, handed over as `critic_llm` and `generator_llm`.
embeddings = OpenAIEmbedding()
critic_llm = OpenAI(model="gpt-4o-mini")
generator_llm = OpenAI(model="gpt-3.5-turbo-16k")

# Wire the three components into a llama_index-flavoured TestsetGenerator.
generator = TestsetGenerator.from_llama_index(
    embeddings=embeddings,
    critic_llm=critic_llm,
    generator_llm=generator_llm,
)





# Load everything under the raw-data directory (with a progress bar), then run
# the project-local preprocessing step on the loaded documents.
reader = SimpleDirectoryReader(DIRECTORY_RAW_DATA)
documents = reader.load_data(show_progress=True)
preped_documents = preprocess_documents(documents)

# Weights per ragas evolution type: half `simple`, a quarter each for
# `reasoning` and `multi_context`.
question_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

# Generate the test set from the preprocessed documents.
testset = generator.generate_with_llamaindex_docs(
    preped_documents,
    test_size=5,
    distributions=question_mix,
)



# Convert the generated test set into a pandas DataFrame for export.
df = testset.to_pandas()

# DataFrame.to_csv does not create missing directories; make sure the target
# folder exists so the (costly) generation result is not lost to an OSError.
Path(TESTSET_PATH).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(TESTSET_PATH)

# The root logger defaults to WARNING, which would silently drop the INFO
# message below. basicConfig is a no-op if handlers were already installed.
logging.basicConfig(level=logging.INFO)
logging.info("Testset has been saved under %s", TESTSET_PATH)