alexkueck commited on
Commit
fca5889
1 Parent(s): 56267d8

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +48 -40
utils.py CHANGED
@@ -186,8 +186,6 @@ modell_rag = DistilBertForQuestionAnswering.from_pretrained(HF_MODELL)
186
  tokenizer_rag = DistilBertTokenizer.from_pretrained(HF_MODELL)
187
  qa_pipeline = pipeline("question-answering", model=modell_rag, tokenizer=tokenizer_rag)
188
 
189
-
190
-
191
  HF_MODELL ="EleutherAI/gpt-neo-2.7B"
192
  modell_rag = GPTNeoForCausalLM.from_pretrained(HF_MODELL)
193
  tokenizer_rag = GPT2Tokenizer.from_pretrained(HF_MODELL)
@@ -280,7 +278,8 @@ def access_pdf(self, filename):
280
  )
281
 
282
  return temp_path
283
-
 
284
  #Besseren Directory-Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
285
  def create_custom_loader(file_type, file_list): #create_directory_loader(file_type, directory_path):
286
  loaders = {
@@ -290,7 +289,7 @@ def create_custom_loader(file_type, file_list): #create_directory_loader(file_t
290
 
291
 
292
  class CustomLoader:
293
- """
294
  def __init__(self, directory_path, file_type, loader_func):
295
  self.directory_path = directory_path
296
  self.file_type = file_type
@@ -306,35 +305,14 @@ def create_custom_loader(file_type, file_list): #create_directory_loader(file_t
306
  return documents
307
 
308
  return CustomLoader(directory_path, file_type, loaders[file_type])
309
- """
310
-
311
- def __init__(self, file_type, file_list, loader_func):
312
- self.file_type = file_type
313
- self.file_list = file_list
314
- self.loader_func = loader_func
315
-
316
- def load(self):
317
- documents = []
318
- for file_path in self.file_list:
319
- with tempfile.NamedTemporaryFile(delete=False, suffix=self.file_type) as temp_file:
320
- temp_path = temp_file.name
321
-
322
- # Datei aus dem Hugging Face Space herunterladen
323
- hf_hub_download(
324
- repo_id=STORAGE_REPO_ID,
325
- filename=file_path,
326
- repo_type="space",
327
- local_dir=os.path.dirname(temp_path),
328
- local_dir_use_symlinks=False,
329
- token=hf_token
330
- )
331
-
332
- documents.extend(self.loader_func(temp_path))
333
-
334
- # Temporäre Datei löschen
335
- os.unlink(temp_path)
336
- return documents
337
 
 
 
 
 
 
 
338
  return CustomLoader(file_type, file_list, loaders[file_type])
339
 
340
  ################################################
@@ -906,6 +884,15 @@ def get_filename(file_pfad):
906
  return result
907
 
908
 
 
 
 
 
 
 
 
 
 
909
  #################################################
910
  #Klasse mit zuständen - z.B. für interrupt wenn Stop gedrückt...
911
  #################################################
@@ -932,14 +919,35 @@ class Document:
932
  }
933
 
934
 
935
- def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
936
- for stop_word in stop_words:
937
- if s.endswith(stop_word):
938
- return True
939
- for i in range(1, len(stop_word)):
940
- if s.endswith(stop_word[:i]):
941
- return True
942
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
943
 
944
 
945
 
 
186
  tokenizer_rag = DistilBertTokenizer.from_pretrained(HF_MODELL)
187
  qa_pipeline = pipeline("question-answering", model=modell_rag, tokenizer=tokenizer_rag)
188
 
 
 
189
  HF_MODELL ="EleutherAI/gpt-neo-2.7B"
190
  modell_rag = GPTNeoForCausalLM.from_pretrained(HF_MODELL)
191
  tokenizer_rag = GPT2Tokenizer.from_pretrained(HF_MODELL)
 
278
  )
279
 
280
  return temp_path
281
+
282
+ """
283
  #Besseren Directory-Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
284
  def create_custom_loader(file_type, file_list): #create_directory_loader(file_type, directory_path):
285
  loaders = {
 
289
 
290
 
291
  class CustomLoader:
292
+
293
  def __init__(self, directory_path, file_type, loader_func):
294
  self.directory_path = directory_path
295
  self.file_type = file_type
 
305
  return documents
306
 
307
  return CustomLoader(directory_path, file_type, loaders[file_type])
308
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
+
311
def create_custom_loader(file_type, file_list):
    """Build a ``CustomLoader`` for the given file extension.

    Args:
        file_type: File extension including the leading dot; the visible
            dispatch table supports ``'.pdf'`` and ``'.docx'``.
        file_list: Files to load; passed through to ``CustomLoader`` unchanged.

    Returns:
        A ``CustomLoader`` wired to the parser registered for ``file_type``.

    Raises:
        KeyError: If ``file_type`` has no registered parser.
    """
    # Dispatch table: file extension -> function that parses one local file.
    loader_by_extension = {
        '.pdf': load_pdf_with_metadata,
        '.docx': load_word_with_metadata,
    }
    loader_func = loader_by_extension[file_type]
    return CustomLoader(file_type, file_list, loader_func)
317
 
318
  ################################################
 
884
  return result
885
 
886
 
887
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    """Return True if *s* ends with a stop word or with the start of one.

    A text that ends with only the first characters of a stop word also
    counts as a match, so a partially emitted stop word is detected too.

    Args:
        s: Text whose tail is inspected.
        stop_words: Stop-word strings. Empty strings are ignored, because
            ``s.endswith("")`` is true for every ``s`` and would make the
            function return True unconditionally.

    Returns:
        True if *s* ends with any non-empty prefix (including the full word)
        of any stop word, otherwise False.
    """
    for stop_word in stop_words:
        if not stop_word:
            # Skip empty entries: they would match every string.
            continue
        # All non-empty prefixes up to and including the full stop word;
        # str.endswith accepts a tuple and checks them in a single call.
        prefixes = tuple(stop_word[:end] for end in range(1, len(stop_word) + 1))
        if s.endswith(prefixes):
            return True
    return False
895
+
896
  #################################################
897
  #Klasse mit zuständen - z.B. für interrupt wenn Stop gedrückt...
898
  #################################################
 
919
  }
920
 
921
 
922
+ ##########################################
923
+ #Klasse für den Directory Loader - um ihn anzupassen
924
+ ##########################################
925
class CustomLoader:
    """Download files from the storage Space and parse them into documents."""

    def __init__(self, file_type, file_list, loader_func):
        """Store the loader configuration.

        Args:
            file_type: File extension handled by this loader (e.g. ``'.pdf'``).
            file_list: Repository-relative file names to download and parse.
            loader_func: Callable that takes a local file path and returns an
                iterable of documents for that file.
        """
        self.file_type = file_type
        self.file_list = file_list
        self.loader_func = loader_func

    def load(self):
        """Download every file in ``file_list`` and return all parsed documents.

        Each file is fetched into its own temporary directory, which is
        removed again even when the download or the parser raises.

        Returns:
            A list with the documents produced by ``loader_func`` for all
            files, in ``file_list`` order.
        """
        documents = []
        for file_path in self.file_list:
            with tempfile.TemporaryDirectory() as temp_dir:
                # Download the file from the Hugging Face Space.
                # hf_hub_download places the file under local_dir using the
                # repository file name and returns the resulting local path;
                # that returned path is the one that must be parsed.
                local_path = hf_hub_download(
                    repo_id=STORAGE_REPO_ID,
                    filename=file_path,
                    repo_type="space",
                    local_dir=temp_dir,
                    local_dir_use_symlinks=False,
                    token=hf_token
                )
                # Consume the loader output before the directory is deleted.
                documents.extend(self.loader_func(local_path))
        return documents
949
+
950
+
951
 
952
 
953