Teapack1 commited on
Commit
1f4bbb8
1 Parent(s): 8335df5

standard commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
37
+ stores/english_512/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: IoT Flask Webserver
3
- emoji: 📟
4
- colorFrom: blue
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 2.9.1
8
  python_version: 3.10.4
 
1
  ---
2
+ title: RAG Retriever Eng Czech
3
+ emoji: 📚
4
+ colorFrom: yellow
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 2.9.1
8
  python_version: 3.10.4
fast_app.py CHANGED
@@ -39,35 +39,61 @@ if openai_api_key is None:
39
  app = FastAPI()
40
  templates = Jinja2Templates(directory="templates")
41
  app.mount("/static", StaticFiles(directory="static"), name="static")
 
 
42
 
43
  czech_store = "stores/czech_512"
44
- english_store = "stores/english_256"
45
 
46
  ingestor = Ingest(
47
  openai_api_key=openai_api_key,
48
- chunk=256,
49
- overlap=128,
50
  czech_store=czech_store,
51
  english_store=english_store,
 
 
52
  )
53
 
54
- load_dotenv()
 
55
 
56
- prompt_template = """You are a electrical engineer focused on lighting and chandeliers. Provide helpful answer to the user question.
57
- If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
 
58
 
59
- Context: {context}
60
- Question: {question}
61
 
62
- Only return the helpful answer below and nothing else.
63
- Helpful answer:
64
- """
 
 
 
 
 
65
 
66
- prompt = PromptTemplate(
67
- template=prompt_template, input_variables=["context", "question"]
68
- )
 
 
 
 
69
 
70
- print("\n Prompt ready... \n\n")
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  @app.get("/", response_class=HTMLResponse)
@@ -96,8 +122,9 @@ async def ingest_data(folderPath: str = Form(...), language: str = Form(...)):
96
  async def get_response(query: str = Form(...), language: str = Form(...)):
97
  print(language)
98
  if language == "czech":
 
99
  print("\n Czech language selected....\n\n")
100
- embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
101
  persist_directory = czech_store
102
  model_name = embedding_model
103
  model_kwargs = {"device": "cpu"}
@@ -108,8 +135,9 @@ async def get_response(query: str = Form(...), language: str = Form(...)):
108
  encode_kwargs=encode_kwargs,
109
  )
110
  else:
 
111
  print("\n English language selected....\n\n")
112
- embedding_model = "text-embedding-3-large" # Default to English
113
  persist_directory = english_store
114
  embedding = OpenAIEmbeddings(
115
  openai_api_key=openai_api_key,
@@ -117,7 +145,7 @@ async def get_response(query: str = Form(...), language: str = Form(...)):
117
  )
118
 
119
  vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
120
- retriever = vectordb.as_retriever(search_kwargs={"k": 10})
121
 
122
  chain_type_kwargs = {"prompt": prompt}
123
  qa_chain = RetrievalQA.from_chain_type(
 
39
  app = FastAPI()
40
  templates = Jinja2Templates(directory="templates")
41
  app.mount("/static", StaticFiles(directory="static"), name="static")
42
+ english_embedding_model="text-embedding-3-large"
43
+ czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en"
44
 
45
  czech_store = "stores/czech_512"
46
+ english_store = "stores/english_512"
47
 
48
  ingestor = Ingest(
49
  openai_api_key=openai_api_key,
50
+ chunk=512,
51
+ overlap=256,
52
  czech_store=czech_store,
53
  english_store=english_store,
54
+ czech_embedding_model=czech_embedding_model,
55
+ english_embedding_model=english_embedding_model,
56
  )
57
 
58
+ def prompt_en():
59
+ prompt_template_en = """You are electrical engineer and you answer users ###Question.
60
 
61
+ #Your answer has to be helpful, relevant and closely related to the user's ###Question.
62
+ #Provide as much literal information and transcription from the #Context as possible.
63
+ #Only use your own words to connect, clarify or explain the information!
64
+ #If you don't know the answer, just say that you don't know, don't try to make up an answer.
65
 
66
+ ###Context: {context}
67
+ ###Question: {question}
68
 
69
+ Only return the helpful answer below and nothing else.
70
+ Helpful answer:
71
+ """
72
+ prompt_en = PromptTemplate(
73
+ template=prompt_template_en, input_variables=["context", "question"]
74
+ )
75
+ print("\n Prompt ready... \n\n")
76
+ return prompt_en
77
 
78
+ def prompt_cz():
79
+ prompt_template_cz = """Jste elektroinženýr a odpovídáte uživatelům na ###Otázku.
80
+
81
+ #Vaše odpověď musí být užitečná, relevantní a úzce souviset s uživatelovou ###Otázkou.
82
+ #Poskytněte co nejvíce doslovných informací a přepisů z #Kontextu.
83
+ #Použijte vlastní slova pouze pro spojení, objasnění nebo vysvětlení informací!
84
+ #Pokud odpověď neznáte, prostě řekněte, že to nevíte, nepokoušejte se vymýšlet odpověď.
85
 
86
+ ###Kontext: {context}
87
+ ###Otázka: {question}
88
+
89
+ Níže vraťte pouze užitečnou odpověď a nic jiného.
90
+ Užitečná odpověď:
91
+ """
92
+ prompt_cz = PromptTemplate(
93
+ template=prompt_template_cz, input_variables=["context", "question"]
94
+ )
95
+ print("\n Prompt ready... \n\n")
96
+ return prompt_cz
97
 
98
 
99
  @app.get("/", response_class=HTMLResponse)
 
122
  async def get_response(query: str = Form(...), language: str = Form(...)):
123
  print(language)
124
  if language == "czech":
125
+ prompt = prompt_cz()
126
  print("\n Czech language selected....\n\n")
127
+ embedding_model = czech_embedding_model
128
  persist_directory = czech_store
129
  model_name = embedding_model
130
  model_kwargs = {"device": "cpu"}
 
135
  encode_kwargs=encode_kwargs,
136
  )
137
  else:
138
+ prompt = prompt_en()
139
  print("\n English language selected....\n\n")
140
+ embedding_model = english_embedding_model # Default to English
141
  persist_directory = english_store
142
  embedding = OpenAIEmbeddings(
143
  openai_api_key=openai_api_key,
 
145
  )
146
 
147
  vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
148
+ retriever = vectordb.as_retriever(search_kwargs={"k": 3})
149
 
150
  chain_type_kwargs = {"prompt": prompt}
151
  qa_chain = RetrievalQA.from_chain_type(
ingest.py CHANGED
@@ -20,6 +20,8 @@ class Ingest:
20
  english_store="stores/english_512",
21
  data_czech="data/czech",
22
  data_english="data/english",
 
 
23
  ):
24
  self.openai_api_key = openai_api_key
25
  self.chunk = chunk
@@ -28,17 +30,20 @@ class Ingest:
28
  self.english_store = english_store
29
  self.data_czech = data_czech
30
  self.data_english = data_english
 
 
31
 
32
  def ingest_english(self):
33
 
34
  embedding = OpenAIEmbeddings(
35
  openai_api_key=self.openai_api_key,
36
- model="text-embedding-3-large",
37
  )
38
 
39
  loader = DirectoryLoader(
40
  self.data_english,
41
  show_progress=True,
 
42
  )
43
 
44
  documents = loader.load()
@@ -58,7 +63,7 @@ class Ingest:
58
  print("\n English vector Store Created.......\n\n")
59
 
60
  def ingest_czech(self):
61
- embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
62
  model_kwargs = {"device": "cpu"}
63
  encode_kwargs = {"normalize_embeddings": False}
64
  embedding = HuggingFaceEmbeddings(
 
20
  english_store="stores/english_512",
21
  data_czech="data/czech",
22
  data_english="data/english",
23
+ english_embedding_model="text-embedding-3-large",
24
+ czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en",
25
  ):
26
  self.openai_api_key = openai_api_key
27
  self.chunk = chunk
 
30
  self.english_store = english_store
31
  self.data_czech = data_czech
32
  self.data_english = data_english
33
+ self.english_embedding_model = english_embedding_model
34
+ self.czech_embedding_model = czech_embedding_model
35
 
36
  def ingest_english(self):
37
 
38
  embedding = OpenAIEmbeddings(
39
  openai_api_key=self.openai_api_key,
40
+ model=self.english_embedding_model,
41
  )
42
 
43
  loader = DirectoryLoader(
44
  self.data_english,
45
  show_progress=True,
46
+ loader_cls=PyPDFLoader,
47
  )
48
 
49
  documents = loader.load()
 
63
  print("\n English vector Store Created.......\n\n")
64
 
65
  def ingest_czech(self):
66
+ embedding_model = self.czech_embedding_model
67
  model_kwargs = {"device": "cpu"}
68
  encode_kwargs = {"normalize_embeddings": False}
69
  embedding = HuggingFaceEmbeddings(
static/dummy.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ dummy
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f8157971983f837eca48b97187f0e8a435eb21270cd49d831db21678670bc4a
3
+ size 1164000
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a3499aedbeb5c8ea26813ed567be6748293334099aa733c4d8cf0c4ec0ee6e3
3
+ size 100
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:612e017796cdd9eef6ba562cbe8c02e16b8c07f3fbac9f1254934f02e2261084
3
+ size 4000
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/link_lists.bin ADDED
File without changes
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f812eacc9c05db367748cf1e0576bdcd28e0b3eaf09d5f3095a1b0e03f71cc8
3
+ size 12428000
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9882e5d786d4ca5fba4a783054685cf6e05b1637aaf586e43ec0e933e30e961d
3
+ size 100
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d49c7e9538b2cfc154773a96a1fcdbf4a4247c3b510bb68d2aa6f2b24e902fca
3
+ size 55974
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd6e73e535a8843ce30d35a4ba88436bcb5687583474e276a3b1f8689c1477bd
3
+ size 4000
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe35f087195e70122f597edc9b62da9d3ce370b40307b5556ebbe4e185fb46d4
3
+ size 8624
templates/index.html CHANGED
@@ -192,9 +192,11 @@
192
  <!-- Example Queries Section -->
193
  <div id="exampleQueries" class="mb-3">
194
  <h2 class="h5">Try Example Queries:</h2>
195
- <button class="btn btn-sm btn-secondary example-query">What cable can you use to hang a pendant light on?</button>
196
  <button class="btn btn-sm btn-secondary example-query">What is the minimal gauge of live wires ?</button>
197
  <button class="btn btn-sm btn-secondary example-query">What flammability requirements do plastic enclosures have to meet ?</button>
 
 
198
  </div>
199
 
200
  <div class="row">
 
192
  <!-- Example Queries Section -->
193
  <div id="exampleQueries" class="mb-3">
194
  <h2 class="h5">Try Example Queries:</h2>
195
+ <button class="btn btn-sm btn-secondary example-query">What cable do I use to hang a 1.5kg heavy luminaire on?</button>
196
  <button class="btn btn-sm btn-secondary example-query">What is the minimal gauge of live wires ?</button>
197
  <button class="btn btn-sm btn-secondary example-query">What flammability requirements do plastic enclosures have to meet ?</button>
198
+ <button class="btn btn-sm btn-secondary example-query">Jaké parametry musí splňovat kovový kryt živých částí ?</button>
199
+
200
  </div>
201
 
202
  <div class="row">