AllenYkl committed on
Commit
686f883
1 Parent(s): a429dbf

Update bin_public/app/llama_func.py

Browse files
Files changed (1) hide show
  1. bin_public/app/llama_func.py +16 -12
bin_public/app/llama_func.py CHANGED
@@ -15,34 +15,38 @@ from bin_public.utils.utils import *
15
 
16
  def get_documents(file_src):
17
  documents = []
18
- index_name = ""
19
  logging.debug("Loading documents...")
20
  logging.debug(f"file_src: {file_src}")
21
  for file in file_src:
22
- logging.debug(f"file: {file.name}")
23
- index_name += file.name
24
  if os.path.splitext(file.name)[1] == ".pdf":
25
  logging.debug("Loading PDF...")
26
- CJKPDFReader = download_loader("CJKPDFReader")
27
- loader = CJKPDFReader()
28
- documents += loader.load_data(file=file.name)
 
 
 
29
  elif os.path.splitext(file.name)[1] == ".docx":
30
  logging.debug("Loading DOCX...")
31
  DocxReader = download_loader("DocxReader")
32
  loader = DocxReader()
33
- documents += loader.load_data(file=file.name)
34
  elif os.path.splitext(file.name)[1] == ".epub":
35
  logging.debug("Loading EPUB...")
36
  EpubReader = download_loader("EpubReader")
37
  loader = EpubReader()
38
- documents += loader.load_data(file=file.name)
39
  else:
40
  logging.debug("Loading text file...")
41
  with open(file.name, "r", encoding="utf-8") as f:
42
- text = add_space(f.read())
43
- documents += [Document(text)]
44
- index_name = sha1sum(index_name)
45
- return documents, index_name
 
 
 
46
 
47
 
48
  def construct_index(
 
15
 
16
  def get_documents(file_src):
17
  documents = []
 
18
  logging.debug("Loading documents...")
19
  logging.debug(f"file_src: {file_src}")
20
  for file in file_src:
21
+ logging.info(f"loading file: {file.name}")
 
22
  if os.path.splitext(file.name)[1] == ".pdf":
23
  logging.debug("Loading PDF...")
24
+ pdftext = ""
25
+ with open(file.name, 'rb') as pdfFileObj:
26
+ pdfReader = PyPDF2.PdfReader(pdfFileObj)
27
+ for page in tqdm(pdfReader.pages):
28
+ pdftext += page.extract_text()
29
+ text_raw = pdftext
30
  elif os.path.splitext(file.name)[1] == ".docx":
31
  logging.debug("Loading DOCX...")
32
  DocxReader = download_loader("DocxReader")
33
  loader = DocxReader()
34
+ text_raw = loader.load_data(file=file.name)[0].text
35
  elif os.path.splitext(file.name)[1] == ".epub":
36
  logging.debug("Loading EPUB...")
37
  EpubReader = download_loader("EpubReader")
38
  loader = EpubReader()
39
+ text_raw = loader.load_data(file=file.name)[0].text
40
  else:
41
  logging.debug("Loading text file...")
42
  with open(file.name, "r", encoding="utf-8") as f:
43
+ text_raw = f.read()
44
+ text = add_space(text_raw)
45
+ # text = block_split(text)
46
+ # documents += text
47
+ documents += [Document(text)]
48
+ logging.debug("Documents loaded.")
49
+ return documents
50
 
51
 
52
  def construct_index(