yasserrmd committed
Commit bf8498e
1 Parent(s): 71df28b

Create extract_text_from_pdf.py

Files changed (1)
  1. extract_text_from_pdf.py +144 -0
extract_text_from_pdf.py ADDED
@@ -0,0 +1,144 @@
+ # extract_text_from_pdf.py
+
+ import os
+ import torch
+ from PyPDF2 import PdfReader
+ from accelerate import Accelerator
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from tqdm import tqdm
+ import warnings
+
+ warnings.filterwarnings('ignore')
+
+
+ class PDFTextExtractor:
+     """
+     A class to handle PDF text extraction and preprocessing for podcast preparation.
+     """
+
+     def __init__(self, pdf_path, output_path='./resources/clean_text.txt',
+                  model_name="meta-llama/Llama-3.2-1B-Instruct"):
+         """
+         Initialize the PDFTextExtractor with paths and model details.
+
+         Args:
+             pdf_path (str): Path to the PDF file.
+             output_path (str): Path to save the cleaned text file.
+             model_name (str): Name of the model to use for text processing.
+         """
+         self.pdf_path = pdf_path
+         self.output_path = output_path
+         self.max_chars = 100000  # hard cap on total extracted characters
+         self.chunk_size = 1000   # target chunk size in characters
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         # Initialize model and tokenizer; accelerator.prepare() readies the model
+         # for the current hardware and passes the tokenizer through unchanged.
+         self.accelerator = Accelerator()
+         self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(self.device)
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model, self.tokenizer = self.accelerator.prepare(self.model, self.tokenizer)
+
+         # System prompt for text processing
+         self.system_prompt = """
+         You are a world-class text pre-processor. Here is raw data from a PDF; parse it and return it in a form that is crisp and usable to send to a podcast writer.
+
+         Be smart and aggressive about removing details; you are only cleaning up the text, not summarizing it.
+         Here is the text:
+         """
+
+     def validate_pdf(self):
+         """Check if the file exists and is a valid PDF."""
+         if not os.path.exists(self.pdf_path):
+             print(f"Error: File not found at path: {self.pdf_path}")
+             return False
+         if not self.pdf_path.lower().endswith('.pdf'):
+             print("Error: File is not a PDF")
+             return False
+         return True
+
+     def extract_text(self):
+         """Extract text from the PDF, limited by max_chars."""
+         if not self.validate_pdf():
+             return None
+
+         with open(self.pdf_path, 'rb') as file:
+             pdf_reader = PdfReader(file)
+             num_pages = len(pdf_reader.pages)
+             print(f"Processing PDF with {num_pages} pages...")
+
+             extracted_text = []
+             total_chars = 0
+
+             for page_num in range(num_pages):
+                 page = pdf_reader.pages[page_num]
+                 text = page.extract_text() or ""
+
+                 if total_chars + len(text) > self.max_chars:
+                     remaining_chars = self.max_chars - total_chars
+                     extracted_text.append(text[:remaining_chars])
+                     print(f"Reached {self.max_chars} character limit at page {page_num + 1}")
+                     break
+
+                 extracted_text.append(text)
+                 total_chars += len(text)
+                 print(f"Processed page {page_num + 1}/{num_pages}")
+
+         final_text = '\n'.join(extracted_text)
+         print(f"Extraction complete! Total characters: {len(final_text)}")
+         return final_text
+
+     def create_word_bounded_chunks(self, text):
+         """Split text into word-bounded chunks around the target size."""
+         words = text.split()
+         chunks = []
+         current_chunk = []
+         current_length = 0
+
+         for word in words:
+             word_length = len(word) + 1  # +1 for the joining space
+             if current_length + word_length > self.chunk_size and current_chunk:
+                 chunks.append(' '.join(current_chunk))
+                 current_chunk = [word]
+                 current_length = word_length
+             else:
+                 current_chunk.append(word)
+                 current_length += word_length
+
+         if current_chunk:
+             chunks.append(' '.join(current_chunk))
+
+         return chunks
+
+     def process_chunk(self, text_chunk):
+         """Process a text chunk with the model and return the cleaned text."""
+         conversation = [
+             {"role": "system", "content": self.system_prompt},
+             {"role": "user", "content": text_chunk}
+         ]
+
+         # add_generation_prompt=True appends the assistant header so the model
+         # writes a fresh reply instead of continuing the user turn. The template
+         # output already contains special tokens, so don't add them again.
+         prompt = self.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+         inputs = self.tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(self.device)
+
+         with torch.no_grad():
+             # do_sample=True is required for temperature/top_p to take effect.
+             output = self.model.generate(
+                 **inputs,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_p=0.9,
+                 max_new_tokens=512,
+                 pad_token_id=self.tokenizer.eos_token_id,
+             )
+
+         # Strip the prompt by token count rather than slicing the decoded string:
+         # skip_special_tokens changes the string length, so a len(prompt) slice
+         # would cut into (or past) the model's actual reply.
+         new_tokens = output[0][inputs["input_ids"].shape[1]:]
+         processed_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
+         return processed_text
+
+     def clean_and_save_text(self):
+         """Extract, clean, and save processed text to a file."""
+         extracted_text = self.extract_text()
+         if not extracted_text:
+             return None
+
+         chunks = self.create_word_bounded_chunks(extracted_text)
+
+         # Make sure the output directory exists before writing.
+         os.makedirs(os.path.dirname(self.output_path) or '.', exist_ok=True)
+
+         with open(self.output_path, 'w', encoding='utf-8') as out_file:
+             for chunk in tqdm(chunks, desc="Processing chunks"):
+                 processed_chunk = self.process_chunk(chunk)
+                 out_file.write(processed_chunk + "\n")
+                 out_file.flush()  # keep partial output on disk as chunks finish
+
+         print(f"\nExtracted and cleaned text has been saved to {self.output_path}")
+         return self.output_path
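For reference, a minimal usage sketch (not part of the commit). The input PDF path below is a placeholder; substitute your own file, and note that the output location defaults to ./resources/clean_text.txt as defined in __init__:

# usage_sketch.py -- assumes extract_text_from_pdf.py is on the import path
from extract_text_from_pdf import PDFTextExtractor

extractor = PDFTextExtractor(pdf_path="./resources/input.pdf")  # placeholder path
clean_text_path = extractor.clean_and_save_text()
if clean_text_path:
    print(f"Cleaned text saved to {clean_text_path}")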