import os
import re

import gradio as gr
import spacy
from PyPDF2 import PdfReader
from spacy.cli import download

# Download the small English model on startup, then load it
download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
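# A minimal sketch of a lazier alternative (an assumption, not part of the
# original app): skip the network download when the model is already installed.
# def load_model(name="en_core_web_sm"):
#     try:
#         return spacy.load(name)
#     except OSError:  # raised when the model package is missing
#         download(name)
#         return spacy.load(name)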
def pdf_to_txt(pdf_file_path, txt_file_path):
    """Convert a PDF to a text file, page by page, with page markers."""
    with open(pdf_file_path, "rb") as filehandle, open(txt_file_path, mode='w', encoding='UTF-8') as output:
        pdf = PdfReader(filehandle)
        for page_number, page in enumerate(pdf.pages):
            print(f"Page: {page_number+1}", file=output)
            print('', file=output)
            print(page.extract_text(), file=output)
            print('', file=output)
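# Standalone usage (hypothetical paths; this assumes the PDF has an extractable
# text layer, since scanned/image-only PDFs would need OCR instead):
# pdf_to_txt("report.pdf", "report.txt")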
def clean_text(text):
    # Collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters, but keep sentence punctuation so that
    # chunk_text still has sentence boundaries to segment on
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    return text
def chunk_text(text):
    doc = nlp(text)
    chunks = [sent.text for sent in doc.sents]
    return chunks
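# Example: with the parser-based sentence boundaries of en_core_web_sm,
# chunk_text("First sentence. Second one.") should return
# ["First sentence.", "Second one."]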
def lemmatize_chunk(chunk):
    doc = nlp(chunk)
    lemmatized_chunk = ' '.join([token.lemma_ for token in doc if token.lemma_ != ''])
    return lemmatized_chunk
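# Example: lemmatize_chunk("The cats were running.") should yield roughly
# "the cat be run ." (exact output depends on the model version)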
def process_large_pdf(file):
    # Strip the extension from the uploaded file's path
    # (os.path.splitext is safer than split('.') for paths containing dots)
    file_name = os.path.splitext(file.name)[0]
    # Convert the PDF to text page by page
    temp_txt_path = f"{file_name}_temp.txt"
    pdf_to_txt(file.name, temp_txt_path)
    # Load the extracted text
    with open(temp_txt_path, 'r', encoding='UTF-8') as file_txt:
        text = file_txt.read()
    # Clean the text
    cleaned_text = clean_text(text)
    # Save the cleaned text
    cleaned_txt_path = f"{file_name}_cleaned.txt"
    with open(cleaned_txt_path, 'w', encoding='UTF-8') as file_cleaned:
        file_cleaned.write(cleaned_text)
    # Chunk the text into sentences
    chunks = chunk_text(cleaned_text)
    # Lemmatize each chunk
    lemmatized_chunks = [lemmatize_chunk(chunk) for chunk in chunks]
    # Save the lemmatized chunks, one per line
    lemmatized_chunks_path = f"{file_name}_lemmatized.txt"
    with open(lemmatized_chunks_path, 'w', encoding='UTF-8') as file_lemmatized:
        for chunk in lemmatized_chunks:
            file_lemmatized.write(chunk + '\n')
    # Remove the temporary text file
    os.remove(temp_txt_path)
    # Return the cleaned text file for download
    return cleaned_txt_path
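# Note: spaCy refuses texts longer than nlp.max_length (1,000,000 characters
# by default), so a truly large PDF can make chunk_text fail. One possible
# workaround, assuming enough RAM for the larger doc:
# nlp.max_length = 2_000_000
# (Alternatively, clean and chunk the extracted text page by page.)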
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Processing App")
    with gr.Column():
        file_obj = gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"])
        submit_button = gr.Button("Process PDF")
        output_file = gr.File(label="Download Cleaned Text File")
    submit_button.click(
        process_large_pdf,
        inputs=file_obj,
        outputs=output_file
    )

if __name__ == "__main__":
    demo.launch()