import json
import logging
import os
import sys

import gradio as gr
import openai
from dotenv import load_dotenv
from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    load_index_from_storage,
    set_global_service_context,
)
from llama_index.indices.service_context import ServiceContext
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.text_splitter import TokenTextSplitter

# load environment variables (including the OpenAI API key) from a .env file
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")
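# (queries will fail with an auth error if OPENAI_API_KEY is missing from .env)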

# enable INFO level logging to stdout (a single handler avoids duplicated log lines)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# define the LLM: low temperature keeps answers close to the retrieved context,
# and the 16k-context model leaves headroom for chunks plus extracted metadata
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo-16k", max_tokens=512)
service_context = ServiceContext.from_defaults(llm=llm)

# construct text splitter to split texts into chunks for processing
text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)
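# (the 128-token overlap shares context across neighbouring chunks, trading
# some duplicated tokens for better retrieval continuity)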

# set the global service context so it doesn't have to be passed explicitly when building the index
set_global_service_context(service_context)

# create metadata extractor
metadata_extractor = MetadataExtractor(
    extractors=[
        TitleExtractor(nodes=1, llm=llm),
        QuestionsAnsweredExtractor(questions=3, llm=llm),
        SummaryExtractor(summaries=["prev", "self"], llm=llm),
        KeywordExtractor(keywords=10, llm=llm)
    ],
)
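# note: each extractor above issues LLM calls per node, so metadata extraction
# cost and indexing time grow with the number of chunks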

# create node parser to split documents into nodes and attach extracted metadata
node_parser = SimpleNodeParser.from_defaults(
    text_splitter=text_splitter,
    metadata_extractor=metadata_extractor,
)

# load the source PDF; filename_as_id=True gives each page a stable document id
documents_2023 = SimpleDirectoryReader(input_files=["data/data.pdf"], filename_as_id=True).load_data()
print(f"loaded data with {len(documents_2023)} pages")


def load_index():
    try:
        # load storage context
        storage_context = StorageContext.from_defaults(persist_dir="./storage")
        # try to load the index from storage
        index = load_index_from_storage(storage_context)
        logging.info("Index loaded from storage.")

    except FileNotFoundError:
        # if index not found, create a new one
        logging.info("Index not found. Creating a new one...")

        nodes_2023 = node_parser.get_nodes_from_documents(documents_2023)
        print(f"loaded nodes_2022 with {len(nodes_2023)} nodes")

        # print metadata in json format
        for node in nodes_2023:
            metadata_json = json.dumps(node.metadata, indent=4)  # Convert metadata to formatted JSON
            print(metadata_json)

        # based on the nodes and service_context, create index
        index = VectorStoreIndex(nodes=nodes_2023, service_context=service_context)
        # persist index to disk (defaults to ./storage, matching persist_dir above)
        index.storage_context.persist()
        logging.info("New index created and persisted to storage.")

    return index


def data_querying(input_text):
    # Load index
    index = load_index()
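    # NOTE: the index is reloaded from disk on every query; a long-running app
    # could load it once at module level to cut per-request latency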

    # queries the index with the input text
    response = index.as_query_engine().query(input_text)

    return response.response


iface = gr.Interface(fn=data_querying,
                     inputs=gr.components.Textbox(lines=3, label="Enter your question here"),
                     outputs=gr.components.Textbox(lines=3, label="Answer"),
                     examples=[
                         ["Who are you?"],
                         ["What is your name?"],
                         ["How can you help me?"],
                         ["What can I do if I lost my password?"],
                         ["What can I do if I lost my credit card?"],
                         ["Generate a report or step-by-step list of everything I need to open an account"],
                     ],
                     title="Conversational AI Demo",
                     description="Ask me anything and I'll answer with what I know.")

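# launch the UI locally; share=True would create a temporary public Gradio link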
iface.launch(share=False)