Teapack1 commited on
Commit
1f4bbb8
1 Parent(s): 8335df5

standard commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
37
+ stores/english_512/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: IoT Flask Webserver
3
- emoji: 📟
4
- colorFrom: blue
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 2.9.1
8
  python_version: 3.10.4
 
1
  ---
2
+ title: RAG Retriever Eng Czech
3
+ emoji: 📚
4
+ colorFrom: yellow
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 2.9.1
8
  python_version: 3.10.4
fast_app.py CHANGED
@@ -39,35 +39,61 @@ if openai_api_key is None:
39
  app = FastAPI()
40
  templates = Jinja2Templates(directory="templates")
41
  app.mount("/static", StaticFiles(directory="static"), name="static")
 
 
42
 
43
  czech_store = "stores/czech_512"
44
- english_store = "stores/english_256"
45
 
46
  ingestor = Ingest(
47
  openai_api_key=openai_api_key,
48
- chunk=256,
49
- overlap=128,
50
  czech_store=czech_store,
51
  english_store=english_store,
 
 
52
  )
53
 
54
- load_dotenv()
 
55
 
56
- prompt_template = """You are a electrical engineer focused on lighting and chandeliers. Provide helpful answer to the user question.
57
- If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
 
58
 
59
- Context: {context}
60
- Question: {question}
61
 
62
- Only return the helpful answer below and nothing else.
63
- Helpful answer:
64
- """
 
 
 
 
 
65
 
66
- prompt = PromptTemplate(
67
- template=prompt_template, input_variables=["context", "question"]
68
- )
 
 
 
 
69
 
70
- print("\n Prompt ready... \n\n")
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  @app.get("/", response_class=HTMLResponse)
@@ -96,8 +122,9 @@ async def ingest_data(folderPath: str = Form(...), language: str = Form(...)):
96
  async def get_response(query: str = Form(...), language: str = Form(...)):
97
  print(language)
98
  if language == "czech":
 
99
  print("\n Czech language selected....\n\n")
100
- embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
101
  persist_directory = czech_store
102
  model_name = embedding_model
103
  model_kwargs = {"device": "cpu"}
@@ -108,8 +135,9 @@ async def get_response(query: str = Form(...), language: str = Form(...)):
108
  encode_kwargs=encode_kwargs,
109
  )
110
  else:
 
111
  print("\n English language selected....\n\n")
112
- embedding_model = "text-embedding-3-large" # Default to English
113
  persist_directory = english_store
114
  embedding = OpenAIEmbeddings(
115
  openai_api_key=openai_api_key,
@@ -117,7 +145,7 @@ async def get_response(query: str = Form(...), language: str = Form(...)):
117
  )
118
 
119
  vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
120
- retriever = vectordb.as_retriever(search_kwargs={"k": 10})
121
 
122
  chain_type_kwargs = {"prompt": prompt}
123
  qa_chain = RetrievalQA.from_chain_type(
 
39
  app = FastAPI()
40
  templates = Jinja2Templates(directory="templates")
41
  app.mount("/static", StaticFiles(directory="static"), name="static")
42
+ english_embedding_model="text-embedding-3-large"
43
+ czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en"
44
 
45
  czech_store = "stores/czech_512"
46
+ english_store = "stores/english_512"
47
 
48
  ingestor = Ingest(
49
  openai_api_key=openai_api_key,
50
+ chunk=512,
51
+ overlap=256,
52
  czech_store=czech_store,
53
  english_store=english_store,
54
+ czech_embedding_model=czech_embedding_model,
55
+ english_embedding_model=english_embedding_model,
56
  )
57
 
58
+ def prompt_en():
59
+ prompt_template_en = """You are electrical engineer and you answer users ###Question.
60
 
61
+ #Your answer has to be helpful, relevant and closely related to the user's ###Question.
62
+ #Provide as much literal information and transcription from the #Context as possible.
63
+ #Only use your own words to connect, clarify or explain the information!
64
+ #If you don't know the answer, just say that you don't know, don't try to make up an answer.
65
 
66
+ ###Context: {context}
67
+ ###Question: {question}
68
 
69
+ Only return the helpful answer below and nothing else.
70
+ Helpful answer:
71
+ """
72
+ prompt_en = PromptTemplate(
73
+ template=prompt_template_en, input_variables=["context", "question"]
74
+ )
75
+ print("\n Prompt ready... \n\n")
76
+ return prompt_en
77
 
78
+ def prompt_cz():
79
+ prompt_template_cz = """Jste elektroinženýr a odpovídáte uživatelům na ###Otázku.
80
+
81
+ #Vaše odpověď musí být užitečná, relevantní a úzce souviset s uživatelovou ###Otázkou.
82
+ #Poskytněte co nejvíce doslovných informací a přepisů z #Kontextu.
83
+ #Použijte vlastní slova pouze pro spojení, objasnění nebo vysvětlení informací!
84
+ #Pokud odpověď neznáte, prostě řekněte, že to nevíte, nepokoušejte se vymýšlet odpověď.
85
 
86
+ ###Kontext: {context}
87
+ ###Otázka: {question}
88
+
89
+ Níže vraťte pouze užitečnou odpověď a nic jiného.
90
+ Užitečná odpověď:
91
+ """
92
+ prompt_cz = PromptTemplate(
93
+ template=prompt_template_cz, input_variables=["context", "question"]
94
+ )
95
+ print("\n Prompt ready... \n\n")
96
+ return prompt_cz
97
 
98
 
99
  @app.get("/", response_class=HTMLResponse)
 
122
  async def get_response(query: str = Form(...), language: str = Form(...)):
123
  print(language)
124
  if language == "czech":
125
+ prompt = prompt_cz()
126
  print("\n Czech language selected....\n\n")
127
+ embedding_model = czech_embedding_model
128
  persist_directory = czech_store
129
  model_name = embedding_model
130
  model_kwargs = {"device": "cpu"}
 
135
  encode_kwargs=encode_kwargs,
136
  )
137
  else:
138
+ prompt = prompt_en()
139
  print("\n English language selected....\n\n")
140
+ embedding_model = english_embedding_model # Default to English
141
  persist_directory = english_store
142
  embedding = OpenAIEmbeddings(
143
  openai_api_key=openai_api_key,
 
145
  )
146
 
147
  vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
148
+ retriever = vectordb.as_retriever(search_kwargs={"k": 3})
149
 
150
  chain_type_kwargs = {"prompt": prompt}
151
  qa_chain = RetrievalQA.from_chain_type(
ingest.py CHANGED
@@ -20,6 +20,8 @@ class Ingest:
20
  english_store="stores/english_512",
21
  data_czech="data/czech",
22
  data_english="data/english",
 
 
23
  ):
24
  self.openai_api_key = openai_api_key
25
  self.chunk = chunk
@@ -28,17 +30,20 @@ class Ingest:
28
  self.english_store = english_store
29
  self.data_czech = data_czech
30
  self.data_english = data_english
 
 
31
 
32
  def ingest_english(self):
33
 
34
  embedding = OpenAIEmbeddings(
35
  openai_api_key=self.openai_api_key,
36
- model="text-embedding-3-large",
37
  )
38
 
39
  loader = DirectoryLoader(
40
  self.data_english,
41
  show_progress=True,
 
42
  )
43
 
44
  documents = loader.load()
@@ -58,7 +63,7 @@ class Ingest:
58
  print("\n English vector Store Created.......\n\n")
59
 
60
  def ingest_czech(self):
61
- embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
62
  model_kwargs = {"device": "cpu"}
63
  encode_kwargs = {"normalize_embeddings": False}
64
  embedding = HuggingFaceEmbeddings(
 
20
  english_store="stores/english_512",
21
  data_czech="data/czech",
22
  data_english="data/english",
23
+ english_embedding_model="text-embedding-3-large",
24
+ czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en",
25
  ):
26
  self.openai_api_key = openai_api_key
27
  self.chunk = chunk
 
30
  self.english_store = english_store
31
  self.data_czech = data_czech
32
  self.data_english = data_english
33
+ self.english_embedding_model = english_embedding_model
34
+ self.czech_embedding_model = czech_embedding_model
35
 
36
  def ingest_english(self):
37
 
38
  embedding = OpenAIEmbeddings(
39
  openai_api_key=self.openai_api_key,
40
+ model=self.english_embedding_model,
41
  )
42
 
43
  loader = DirectoryLoader(
44
  self.data_english,
45
  show_progress=True,
46
+ loader_cls=PyPDFLoader,
47
  )
48
 
49
  documents = loader.load()
 
63
  print("\n English vector Store Created.......\n\n")
64
 
65
  def ingest_czech(self):
66
+ embedding_model = self.czech_embedding_model
67
  model_kwargs = {"device": "cpu"}
68
  encode_kwargs = {"normalize_embeddings": False}
69
  embedding = HuggingFaceEmbeddings(
static/dummy.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ dummy
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f8157971983f837eca48b97187f0e8a435eb21270cd49d831db21678670bc4a
3
+ size 1164000
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a3499aedbeb5c8ea26813ed567be6748293334099aa733c4d8cf0c4ec0ee6e3
3
+ size 100
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:612e017796cdd9eef6ba562cbe8c02e16b8c07f3fbac9f1254934f02e2261084
3
+ size 4000
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/link_lists.bin ADDED
File without changes
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f812eacc9c05db367748cf1e0576bdcd28e0b3eaf09d5f3095a1b0e03f71cc8
3
+ size 12428000
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9882e5d786d4ca5fba4a783054685cf6e05b1637aaf586e43ec0e933e30e961d
3
+ size 100
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d49c7e9538b2cfc154773a96a1fcdbf4a4247c3b510bb68d2aa6f2b24e902fca
3
+ size 55974
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd6e73e535a8843ce30d35a4ba88436bcb5687583474e276a3b1f8689c1477bd
3
+ size 4000
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe35f087195e70122f597edc9b62da9d3ce370b40307b5556ebbe4e185fb46d4
3
+ size 8624
templates/index.html CHANGED
@@ -192,9 +192,11 @@
192
  <!-- Example Queries Section -->
193
  <div id="exampleQueries" class="mb-3">
194
  <h2 class="h5">Try Example Queries:</h2>
195
- <button class="btn btn-sm btn-secondary example-query">What cable can you use to hang a pendant light on?</button>
196
  <button class="btn btn-sm btn-secondary example-query">What is the minimal gauge of live wires ?</button>
197
  <button class="btn btn-sm btn-secondary example-query">What flammability requirements do plastic enclosures have to meet ?</button>
 
 
198
  </div>
199
 
200
  <div class="row">
 
192
  <!-- Example Queries Section -->
193
  <div id="exampleQueries" class="mb-3">
194
  <h2 class="h5">Try Example Queries:</h2>
195
+ <button class="btn btn-sm btn-secondary example-query">What cable do I use to hang a 1.5kg heavy luminaire on?</button>
196
  <button class="btn btn-sm btn-secondary example-query">What is the minimal gauge of live wires ?</button>
197
  <button class="btn btn-sm btn-secondary example-query">What flammability requirements do plastic enclosures have to meet ?</button>
198
+ <button class="btn btn-sm btn-secondary example-query">Jaké parametry musí splňovat kovový kryt živých částí ?</button>
199
+
200
  </div>
201
 
202
  <div class="row">