Spaces:

dteam
/

chatgpt-dteam

Running

App Files Files Community

AllenYkl committed on Mar 31, 2023

Commit

f2c9c98

•

1 Parent(s): 2652cfb

Update bin_public/app/llama_func.py

Browse files

Files changed (1) hide show

bin_public/app/llama_func.py +70 -45

bin_public/app/llama_func.py CHANGED Viewed

@@ -1,4 +1,7 @@
-from llama_index import GPTSimpleVectorIndex
 from llama_index import download_loader
 from llama_index import (
     Document,
@@ -8,10 +11,33 @@ from llama_index import (
     RefinePrompt,
 )
 from langchain.llms import OpenAI
 import colorama
 from bin_public.utils.utils import *
 def get_documents(file_src):
     documents = []
@@ -50,16 +76,14 @@ def get_documents(file_src):
 def construct_index(
-    api_key,
-    file_src,
-    max_input_size=4096,
-    num_outputs=1,
-    max_chunk_overlap=20,
-    chunk_size_limit=600,
-    embedding_limit=None,
-    separator=" ",
-    num_children=10,
-    max_keywords_per_chunk=10,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
     chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
@@ -67,40 +91,40 @@ def construct_index(
     separator = " " if separator == "" else separator
     llm_predictor = LLMPredictor(
-        llm=OpenAI(model_name="gpt-3.5-turbo-0301", openai_api_key=api_key)
-    )
-    prompt_helper = PromptHelper(
-        max_input_size,
-        num_outputs,
-        max_chunk_overlap,
-        embedding_limit,
-        chunk_size_limit,
-        separator=separator,
     )
-    documents, index_name = get_documents(file_src)
     if os.path.exists(f"./index/{index_name}.json"):
         logging.info("找到了缓存的索引文件，加载中……")
         return GPTSimpleVectorIndex.load_from_disk(f"./index/{index_name}.json")
     else:
         try:
-            logging.debug("构建索引中……")
-            index = GPTSimpleVectorIndex(
-                documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
             )
-            # os.makedirs("./index", exist_ok=True)
-            # index.save_to_disk(f"./index/{index_name}.json")
             return index
         except Exception as e:
             print(e)
             return None
 def chat_ai(
-    api_key,
-    index,
-    question,
-    context,
-    chatbot,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
@@ -113,8 +137,9 @@ def chat_ai(
         replace_today(PROMPT_TEMPLATE),
         REFINE_TEMPLATE,
         SIM_K,
-        INDEX_QUERY_TEMPERATURE,
         context,
     )
     if response is None:
         status_text = "查询失败，请换个问法试试"
@@ -130,21 +155,22 @@ def chat_ai(
 def ask_ai(
-    api_key,
-    index,
-    question,
-    prompt_tmpl,
-    refine_tmpl,
-    sim_k=1,
-    temprature=0,
-    prefix_messages=[],
 ):
     os.environ["OPENAI_API_KEY"] = api_key
     logging.debug("Index file found")
     logging.debug("Querying index...")
     llm_predictor = LLMPredictor(
-        llm=OpenAI(
             temperature=temprature,
             model_name="gpt-3.5-turbo-0301",
             prefix_messages=prefix_messages,
@@ -152,11 +178,10 @@ def ask_ai(
     )
     response = None  # Initialize response variable to avoid UnboundLocalError
-    qa_prompt = QuestionAnswerPrompt(prompt_tmpl)
-    rf_prompt = RefinePrompt(refine_tmpl)
     response = index.query(
         question,
-        llm_predictor=llm_predictor,
         similarity_top_k=sim_k,
         text_qa_template=qa_prompt,
         refine_template=rf_prompt,
@@ -170,7 +195,7 @@ def ask_ai(
         for index, node in enumerate(response.source_nodes):
             brief = node.source_text[:25].replace("\n", "")
             nodes.append(
-                f"<details><summary>[{index+1}]\t{brief}...</summary><p>{node.source_text}</p></details>"
             )
         new_response = ret_text + "\n----------\n" + "\n\n".join(nodes)
         logging.info(

+import os
+import logging
+from llama_index import GPTSimpleVectorIndex, ServiceContext
 from llama_index import download_loader
 from llama_index import (
     Document,
     RefinePrompt,
 )
 from langchain.llms import OpenAI
+from langchain.chat_models import ChatOpenAI
 import colorama
+import PyPDF2
+from tqdm import tqdm
+import hashlib
+from bin_public.config.presets import *
 from bin_public.utils.utils import *
def get_index_name(file_src):
    """Return an MD5 hex digest that identifies the contents of the given files.

    The digest is used as a cache key for the on-disk index, so it must not
    depend on the order in which the caller happens to list the files.

    Args:
        file_src: Iterable of uploaded-file objects exposing the file path via
            a ``name`` attribute, or plain path strings / ``os.PathLike``
            values.

    Returns:
        The hexadecimal MD5 digest of the concatenated file contents, with
        files visited in sorted order. An empty ``file_src`` yields the digest
        of empty input.

    Raises:
        OSError: If one of the files cannot be opened or read.
    """
    file_paths = []
    for item in file_src:
        # Path-like values are checked first: ``pathlib.Path.name`` is only
        # the basename, not a path that can be opened.
        if isinstance(item, (str, os.PathLike)):
            file_paths.append(os.fspath(item))
        else:
            file_paths.append(item.name)

    # Sort by basename as before, but break ties on the full path so two files
    # sharing a basename hash in the same order regardless of input order.
    file_paths.sort(key=lambda path: (os.path.basename(path), path))

    md5_hash = hashlib.md5()
    for file_path in file_paths:
        with open(file_path, "rb") as f:
            # Stream in fixed-size chunks so large files are never fully
            # loaded into memory.
            while chunk := f.read(8192):
                md5_hash.update(chunk)
    # NOTE: contents are hashed back-to-back with no delimiter between files;
    # kept that way so previously cached index names remain valid.
    return md5_hash.hexdigest()
def block_split(text, block_size=1000):
    """Split ``text`` into consecutive ``Document`` blocks.

    Args:
        text: The string to split.
        block_size: Maximum number of characters per block. Defaults to 1000,
            the size that was previously hard-coded.

    Returns:
        A list of ``Document`` objects, each wrapping at most ``block_size``
        characters of ``text`` in their original order. Empty ``text`` yields
        an empty list.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")
    # Walk the offsets instead of repeatedly re-slicing the remaining tail,
    # which copied the rest of the string on every iteration.
    return [
        Document(text[start:start + block_size])
        for start in range(0, len(text), block_size)
    ]
 def get_documents(file_src):
     documents = []
 def construct_index(
+        api_key,
+        file_src,
+        max_input_size=4096,
+        num_outputs=5,
+        max_chunk_overlap=20,
+        chunk_size_limit=600,
+        embedding_limit=None,
+        separator=" "
 ):
     os.environ["OPENAI_API_KEY"] = api_key
     chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
     separator = " " if separator == "" else separator
     llm_predictor = LLMPredictor(
+        llm=ChatOpenAI(model_name="gpt-3.5-turbo-0301", openai_api_key=api_key)
     )
+    prompt_helper = PromptHelper(max_input_size = max_input_size, num_output = num_outputs, max_chunk_overlap = max_chunk_overlap, embedding_limit=embedding_limit, chunk_size_limit=600, separator=separator)
+    index_name = get_index_name(file_src)
     if os.path.exists(f"./index/{index_name}.json"):
         logging.info("找到了缓存的索引文件，加载中……")
         return GPTSimpleVectorIndex.load_from_disk(f"./index/{index_name}.json")
     else:
         try:
+            documents = get_documents(file_src)
+            logging.info("构建索引中……")
+            service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, chunk_size_limit=chunk_size_limit)
+            index = GPTSimpleVectorIndex.from_documents(
+                documents,  service_context=service_context
             )
+            logging.debug("索引构建完成！")
+            os.makedirs("./index", exist_ok=True)
+            index.save_to_disk(f"./index/{index_name}.json")
+            logging.debug("索引已保存至本地!")
             return index
         except Exception as e:
+            logging.error("索引构建失败！", e)
             print(e)
             return None
 def chat_ai(
+        api_key,
+        index,
+        question,
+        context,
+        chatbot,
+        reply_language,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
         replace_today(PROMPT_TEMPLATE),
         REFINE_TEMPLATE,
         SIM_K,
+        1.0,
         context,
+        reply_language,
     )
     if response is None:
         status_text = "查询失败，请换个问法试试"
 def ask_ai(
+        api_key,
+        index,
+        question,
+        prompt_tmpl,
+        refine_tmpl,
+        sim_k=5,
+        temprature=0,
+        prefix_messages=[],
+        reply_language="中文",
 ):
     os.environ["OPENAI_API_KEY"] = api_key
     logging.debug("Index file found")
     logging.debug("Querying index...")
     llm_predictor = LLMPredictor(
+        llm=ChatOpenAI(
             temperature=temprature,
             model_name="gpt-3.5-turbo-0301",
             prefix_messages=prefix_messages,
     )
     response = None  # Initialize response variable to avoid UnboundLocalError
+    qa_prompt = QuestionAnswerPrompt(prompt_tmpl.replace("{reply_language}", reply_language))
+    rf_prompt = RefinePrompt(refine_tmpl.replace("{reply_language}", reply_language))
     response = index.query(
         question,
         similarity_top_k=sim_k,
         text_qa_template=qa_prompt,
         refine_template=rf_prompt,
         for index, node in enumerate(response.source_nodes):
             brief = node.source_text[:25].replace("\n", "")
             nodes.append(
+                f"<details><summary>[{index + 1}]\t{brief}...</summary><p>{node.source_text}</p></details>"
             )
         new_response = ret_text + "\n----------\n" + "\n\n".join(nodes)
         logging.info(