yasserrmd committed on
Commit
7fb740b
·
verified ·
1 Parent(s): 80a6d9f

Update extract_text_from_pdf.py

Browse files
Files changed (1) hide show
  1. extract_text_from_pdf.py +6 -6
extract_text_from_pdf.py CHANGED
@@ -8,12 +8,11 @@ from accelerate import Accelerator
8
  from transformers import AutoModelForCausalLM, AutoTokenizer
9
  from tqdm import tqdm
10
  import warnings
11
-
12
-
13
 
14
  warnings.filterwarnings('ignore')
15
 
16
-
17
  class PDFTextExtractor:
18
  """
19
  A class to handle PDF text extraction and preprocessing for podcast preparation.
@@ -29,7 +28,8 @@ class PDFTextExtractor:
29
  model_name (str): Name of the model to use for text processing.
30
  """
31
 
32
- model_name="meta-llama/Llama-3.2-1B-Instruct"
 
33
  self.pdf_path = pdf_path
34
  self.output_path = output_path
35
  self.max_chars = 100000
@@ -38,8 +38,8 @@ class PDFTextExtractor:
38
 
39
  # Initialize model and tokenizer
40
  self.accelerator = Accelerator()
41
- self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(self.device)
42
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
43
  self.model, self.tokenizer = self.accelerator.prepare(self.model, self.tokenizer)
44
 
45
  # System prompt for text processing
 
8
  from transformers import AutoModelForCausalLM, AutoTokenizer
9
  from tqdm import tqdm
10
  import warnings
11
+ import spaces
 
12
 
13
  warnings.filterwarnings('ignore')
14
 
15
+ @spaces.GPU
16
  class PDFTextExtractor:
17
  """
18
  A class to handle PDF text extraction and preprocessing for podcast preparation.
 
28
  model_name (str): Name of the model to use for text processing.
29
  """
30
 
31
+ model_name="bartowski/Llama-3.2-1B-Instruct-GGUF"
32
+ filename = "Llama-3.2-1B-Instruct-Q5_K_S.gguf"
33
  self.pdf_path = pdf_path
34
  self.output_path = output_path
35
  self.max_chars = 100000
 
38
 
39
  # Initialize model and tokenizer
40
  self.accelerator = Accelerator()
41
+ self.model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=filename, torch_dtype=torch.bfloat16).to(self.device)
42
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name, gguf_file=filename)
43
  self.model, self.tokenizer = self.accelerator.prepare(self.model, self.tokenizer)
44
 
45
  # System prompt for text processing