Upload localKB_construct.py
localKB_construct.py (CHANGED: +18 -8)
@@ -2,10 +2,12 @@
 1. Updated the llama-index library. The corresponding function names and usage have changed.
 '''

+# import gradio as gr
 import openai
 import requests
 import csv
 from llama_index import PromptHelper
+# from llama_index import GPTSimpleVectorIndex ## renamed in the latest version.
 from llama_index import LLMPredictor
 from llama_index import ServiceContext
 from langchain.chat_models import ChatOpenAI
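Note: the two commented-out imports above track the GPTSimpleVectorIndex -> GPTVectorStoreIndex rename mentioned later in the file. A minimal compatibility sketch, assuming the legacy (pre-0.10) llama_index import layout used throughout this file; the try/except shim is illustrative and not part of the commit:

    # hedged sketch: prefer the renamed class, fall back to the old name on older installs
    try:
        from llama_index import GPTVectorStoreIndex
    except ImportError:
        from llama_index import GPTSimpleVectorIndex as GPTVectorStoreIndex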
@@ -18,7 +20,6 @@ import math
 import pandas as pd
 import numpy as np
 import PyPDF2
-# from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTSimpleVectorIndex, LLMPredictor, PromptHelper #* working in the previous version.

 ##* in the latest version: GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex, try removing it from the end of your imports
 from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTVectorStoreIndex, LLMPredictor, PromptHelper
@@ -28,7 +29,7 @@ from llama_index import download_loader
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 import sys
 import os
-
+from rich import print

 ## environment settings.
 os.environ["OPENAI_API_KEY"] = "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"
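Note: the environment-settings line above hardcodes the OpenAI key in the source. A minimal sketch of reading it from the shell environment instead; only the OPENAI_API_KEY variable name comes from the file, everything else is an assumption:

    import os
    import openai

    # hedged sketch: expect the key to be exported in the shell (e.g. export OPENAI_API_KEY=...)
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is not set")
    openai.api_key = api_key  # pre-1.0 openai client style, matching the imports in this file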
@@ -72,27 +73,36 @@ def construct_index(directory_path):

     ## For a PDF file, use the commands below. Note the difference from a txt file; `from llama_index import download_loader` is also required.
     #NOTE: Here you can ask: give me an example of GPT-4 solving math problem. The answer will draw on the content of this PDF, which confirms that the program uses in-context learning.
-    CJKPDFReader = download_loader("CJKPDFReader")
-    loader = CJKPDFReader()
+    # CJKPDFReader = download_loader("CJKPDFReader") ## no longer seems to work in the latest version; use the command below instead.
+    # loader = CJKPDFReader()
+    PDFReader = download_loader("PDFReader") # working.
+    loader = PDFReader()
     # documents = loader.load_data(file=directory_path) #! Note that this points to the file itself, unlike the txt case, which points to a folder.
+    print('directory_path now:', directory_path)
+    # print('111')
+    # documents = loader.load_data(file="/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf") #! Note that this points to the file itself, unlike the txt case, which points to a folder.
     documents = loader.load_data(file=directory_path) #! Note that this points to the file itself, unlike the txt case, which points to a folder.
+    print('222')
     # index = GPTSimpleVectorIndex(
     #     documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
     # )

     # index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context) ## original version, working.
+    # print('documents:', documents)
     index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) #* the function was renamed.
+    print('333')
+
     # index.save_to_disk('/Users/yunshi/Downloads/txt_dir/index.json') ## in the latest version, this function is not working.

     return index, service_context

-def process_file():
+def process_file(file_path, username):
     print('process_file starts')
-    file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
+    # file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
     #! This function needs to run the first time. Once the index has been tested, it does not need to run again. Remember to upload the PDF and JSON files to the cloud server.
     index, service_context = construct_index(file_path)
     # index.storage_context.persist(persist_dir="/Users/yunshi/Downloads/txt_dir/") #* save locally for later use.
-    index.storage_context.persist(persist_dir=f"./") #* save locally for later use.
+    index.storage_context.persist(persist_dir=f"./{username}/") #* save locally for later use.
     print(index)

-process_file()
+# process_file(file_path)
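Note: the persist() call replaces the older save_to_disk(), which the comment says no longer works. A minimal sketch of how the persisted index could be reloaded and queried later, assuming the same legacy llama_index API used in this file; the persist directory is an example standing in for f"./{username}/", and the question string is the one suggested in the #NOTE comment:

    from llama_index import StorageContext, load_index_from_storage

    # hedged sketch: rebuild the index object from the directory written by persist()
    storage_context = StorageContext.from_defaults(persist_dir="./demo_user/")  # example directory
    index = load_index_from_storage(storage_context)

    # query it the same way the #NOTE comment suggests
    query_engine = index.as_query_engine()
    response = query_engine.query("give me an example of GPT-4 solving math problem")
    print(response)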