Upload localKB_construct.py
Browse files- localKB_construct.py +101 -0
localKB_construct.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
'''
|
| 2 |
+
1.更新了llama-index的库。对应的函数名和用法都有所改变。
|
| 3 |
+
'''
|
| 4 |
+
|
| 5 |
+
# import gradio as gr
|
| 6 |
+
import openai
|
| 7 |
+
import requests
|
| 8 |
+
import csv
|
| 9 |
+
from llama_index import PromptHelper
|
| 10 |
+
# from llama_index import GPTSimpleVectorIndex ## renamed in the latest version.
|
| 11 |
+
from llama_index import LLMPredictor
|
| 12 |
+
from llama_index import ServiceContext
|
| 13 |
+
from langchain.chat_models import ChatOpenAI
|
| 14 |
+
from langchain import OpenAI
|
| 15 |
+
from fastapi import FastAPI #* 实现流式数据
|
| 16 |
+
from fastapi.responses import StreamingResponse #* 实现流式数据
|
| 17 |
+
import sys
|
| 18 |
+
import os
|
| 19 |
+
import torch
|
| 20 |
+
import math
|
| 21 |
+
import pandas as pd
|
| 22 |
+
import numpy as np
|
| 23 |
+
import PyPDF2
|
| 24 |
+
# from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTSimpleVectorIndex, LLMPredictor, PromptHelper #* working in the previous version.
|
| 25 |
+
|
| 26 |
+
##* in the latest version: GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex, try removing it from the end of your imports
|
| 27 |
+
from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTVectorStoreIndex, LLMPredictor, PromptHelper
|
| 28 |
+
from llama_index import StorageContext, load_index_from_storage
|
| 29 |
+
from llama_index import ServiceContext
|
| 30 |
+
from llama_index import download_loader
|
| 31 |
+
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
| 32 |
+
import sys
|
| 33 |
+
import os
|
| 34 |
+
from rich import print
|
| 35 |
+
|
| 36 |
+
## Environment settings.
# SECURITY: an OpenAI API key was previously hard-coded here and committed to
# version control. That key is compromised and must be revoked/rotated.
# Read the key from the environment instead of embedding it in source.
_api_key = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = _api_key
if not _api_key:
    print("WARNING: OPENAI_API_KEY is not set; OpenAI API calls will fail.")
# file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
# file_path = "/Users/yunshi/Downloads/txt_dir/2023年百人会电动论坛 纪要 20230401.pdf"
| 41 |
+
|
| 42 |
+
## 建立index或者的过程。
|
| 43 |
+
def construct_index(directory_path):
    """Build a llama-index vector index over a single PDF file.

    Note: despite the parameter name, ``directory_path`` points at the PDF
    file itself — CJKPDFReader.load_data takes a file, unlike
    SimpleDirectoryReader which takes a directory.

    Args:
        directory_path: Path to the PDF file to index.

    Returns:
        tuple: ``(index, service_context)`` — the built GPTVectorStoreIndex
        and the ServiceContext used to create it.
    """
    file_path = directory_path

    # Prompt sizing for the LLM context window.
    max_input_size = 4096    # maximum tokens the model accepts per request
    num_outputs = 1000       # tokens reserved for the model's answer
    chunk_size_limit = 6000  # maximum tokens per document chunk

    # LLM used for answering; streaming enabled for incremental output.
    llm_predictor = LLMPredictor(
        llm=ChatOpenAI(
            temperature=0.7,
            model_name="gpt-3.5-turbo-16k",
            max_tokens=512,
            streaming=True,
        )
    )

    # Newer llama-index versions replaced the old max_chunk_overlap token
    # count with chunk_overlap_ratio (a fraction of the chunk size).
    prompt_helper = PromptHelper(
        max_input_size,
        num_outputs,
        chunk_overlap_ratio=0.1,
        chunk_size_limit=chunk_size_limit,
    )
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )

    # CJKPDFReader handles PDFs containing CJK text.
    # NOTE: asking e.g. "give me an example of GPT-4 solving a math problem"
    # returns content from this PDF, confirming in-context retrieval works.
    CJKPDFReader = download_loader("CJKPDFReader")
    loader = CJKPDFReader()
    documents = loader.load_data(file=file_path)

    # GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex upstream;
    # index.save_to_disk no longer exists — callers persist via
    # index.storage_context.persist(...) instead.
    index = GPTVectorStoreIndex.from_documents(
        documents, service_context=service_context
    )

    return index, service_context
| 91 |
+
|
| 92 |
+
def process_file(file_path="/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf",
                 persist_dir="./"):
    """Build the index for *file_path* and persist it to *persist_dir*.

    This only needs to run once per document: after the index storage files
    are persisted they can be reloaded with ``load_index_from_storage``
    instead of rebuilding. Remember to upload the PDF and the persisted JSON
    files to the server.

    Args:
        file_path: PDF file to index. The default preserves the path that
            was previously hard-coded in the body.
        persist_dir: Directory the index storage files are written to.
    """
    print('process_file starts')
    index, service_context = construct_index(file_path)
    # Persist locally so later runs can reload instead of re-indexing.
    index.storage_context.persist(persist_dir=persist_dir)
    print(index)
| 100 |
+
|
| 101 |
+
# Build and persist the index only when run as a script, so importing this
# module (e.g. to reuse construct_index) does not trigger a full re-index.
if __name__ == "__main__":
    process_file()
|