terapyon committed on
Commit 8034497
1 Parent(s): c959977

dev/modify-load-llm (#9)


- modify load timing of model (ff920d1f88f3f44ef23ace448c056fa1a8d226e3)

Files changed (1)
  1. app.py +26 -20
app.py CHANGED
@@ -1,4 +1,4 @@
-# from time import time
+from time import time
 import gradio as gr
 from langchain.chains import RetrievalQA
 from langchain.embeddings import OpenAIEmbeddings
@@ -16,17 +16,29 @@ from qdrant_client import QdrantClient
 from config import DB_CONFIG, DB_E5_CONFIG
 
 
+E5_MODEL_NAME = "intfloat/multilingual-e5-large"
+E5_MODEL_KWARGS = {"device": "cuda:0" if torch.cuda.is_available() else "cpu"}
+E5_ENCODE_KWARGS = {"normalize_embeddings": False}
+E5_EMBEDDINGS = HuggingFaceEmbeddings(
+    model_name=E5_MODEL_NAME,
+    model_kwargs=E5_MODEL_KWARGS,
+    encode_kwargs=E5_ENCODE_KWARGS,
+)
+
+RINNA_MODEL_NAME = "rinna/bilingual-gpt-neox-4b-instruction-ppo"
+RINNA_TOKENIZER = AutoTokenizer.from_pretrained(RINNA_MODEL_NAME, use_fast=False)
+RINNA_MODEL = AutoModelForCausalLM.from_pretrained(
+    RINNA_MODEL_NAME,
+    load_in_8bit=True,
+    torch_dtype=torch.float16,
+    device_map="auto",
+)
+
+
 def _get_config_and_embeddings(collection_name: str | None) -> tuple:
     if collection_name is None or collection_name == "E5":
         db_config = DB_E5_CONFIG
-        model_name = "intfloat/multilingual-e5-large"
-        model_kwargs = {"device": "cpu"}
-        encode_kwargs = {"normalize_embeddings": False}
-        embeddings = HuggingFaceEmbeddings(
-            model_name=model_name,
-            model_kwargs=model_kwargs,
-            encode_kwargs=encode_kwargs,
-        )
+        embeddings = E5_EMBEDDINGS
     elif collection_name == "OpenAI":
         db_config = DB_CONFIG
         embeddings = OpenAIEmbeddings()
@@ -36,18 +48,10 @@ def _get_config_and_embeddings(collection_name: str | None) -> tuple:
 
 
 def _get_rinna_llm(temperature: float):
-    model = "rinna/bilingual-gpt-neox-4b-instruction-ppo"
-    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    model = AutoModelForCausalLM.from_pretrained(
-        model,
-        load_in_8bit=True,
-        torch_dtype=torch.float16,
-        device_map="auto",
-    )
     pipe = pipeline(
         "text-generation",
-        model=model,
-        tokenizer=tokenizer,
+        model=RINNA_MODEL,
+        tokenizer=RINNA_TOKENIZER,
         max_new_tokens=1024,
         temperature=temperature,
     )
@@ -139,6 +143,7 @@ def get_related_url(metadata):
 def main(
     query: str, collection_name: str, model_name: str, option: str, temperature: float
 ):
+    now = time()
     qa = get_retrieval_qa(collection_name, model_name, temperature, option)
     try:
         result = qa(query)
@@ -146,7 +151,8 @@ def main(
         return "回答が見つかりませんでした。別な質問をしてみてください", str(e)
     else:
         metadata = [s.metadata for s in result["source_documents"]]
-        html = "<div>" + "\n".join(get_related_url(metadata)) + "</div>"
+        sec_html = f"<p>実行時間: {(time() - now):.2f}秒</p>"
+        html = "<div>" + sec_html + "\n".join(get_related_url(metadata)) + "</div>"
 
     return result["result"], html
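
Note: the change moves the E5 embeddings and the rinna model/tokenizer from per-call construction inside _get_config_and_embeddings and _get_rinna_llm to module-level constants, so the weights are loaded once at import rather than on every request, and main now reports the per-query elapsed time. Below is a minimal sketch of an alternative load-once pattern that defers the heavy load to the first call via functools.lru_cache; it reuses the model name from the diff, but the load_rinna_once helper is illustrative and not part of the commit.

from functools import lru_cache

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


@lru_cache(maxsize=1)
def load_rinna_once():
    # First call loads the 4B model; later calls return the cached pair.
    name = "rinna/bilingual-gpt-neox-4b-instruction-ppo"
    tokenizer = AutoTokenizer.from_pretrained(name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        name,
        load_in_8bit=True,          # needs bitsandbytes, same as the commit
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return model, tokenizer

Compared with module-level constants, this variant keeps start-up fast when the model is never used, at the cost of a slower first query; either way the reload-on-every-call overhead that motivated the commit is avoided.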