import os
import re

import gradio as gr
import spacy
from PyPDF2 import PdfReader

# Load the spaCy model, downloading it on first run if it is missing
# (avoids re-downloading the model on every startup)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def pdf_to_txt(pdf_file_path, txt_file_path):
    # Extract text from a PDF page by page, writing each page to the text
    # file under a "Page: N" header
    with open(pdf_file_path, "rb") as filehandle, open(txt_file_path, mode='w', encoding='UTF-8') as output:
        pdf = PdfReader(filehandle)

        for page_number, page in enumerate(pdf.pages):
            print(f"Page: {page_number + 1}", file=output)
            print('', file=output)
            print(page.extract_text(), file=output)
            print('', file=output)

def clean_text(text):
    # Collapse runs of whitespace (including newlines) into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters, but keep basic punctuation so that spaCy
    # can still detect sentence boundaries in chunk_text()
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    return text
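
# Rough illustration: clean_text("Hello,\n  world!!") returns "Hello, world!!"
# (runs of whitespace collapsed, punctuation kept, other symbols dropped).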

def chunk_text(text):
    # Split the text into sentence-sized chunks using spaCy's sentence segmenter
    doc = nlp(text)
    chunks = [str(sent) for sent in doc.sents]
    return chunks
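
# Note (assumption about input size): spaCy raises an error for texts longer
# than nlp.max_length (1,000,000 characters by default), so a very large PDF
# may need nlp.max_length raised or page-by-page processing.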

def lemmatize_chunk(chunk):
    # Re-parse the chunk and join the lemma of every token
    doc = nlp(chunk)
    lemmatized_chunk = ' '.join([token.lemma_ for token in doc if token.lemma_ != ''])
    return lemmatized_chunk
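
# Rough example: lemmatize_chunk("The cats are running") should yield
# something like "the cat be run" with the en_core_web_sm pipeline.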

def process_large_pdf(file):
    # Strip the extension to build the output file names
    # (os.path.splitext is safer than split('.') for paths containing dots)
    file_name = os.path.splitext(file.name)[0]

    # Convert PDF to text page by page
    temp_txt_path = f"{file_name}_temp.txt"
    pdf_to_txt(file.name, temp_txt_path)

    # Load the text file
    with open(temp_txt_path, 'r', encoding='UTF-8') as file_txt:
        text = file_txt.read()
    
    # Clean the text
    cleaned_text = clean_text(text)
    
    # Save the cleaned text
    cleaned_txt_path = f"{file_name}_cleaned.txt"
    with open(cleaned_txt_path, 'w', encoding='UTF-8') as file_cleaned:
        file_cleaned.write(cleaned_text)
    
    # Chunk the text
    chunks = chunk_text(cleaned_text)
    
    # Lemmatize each chunk
    lemmatized_chunks = [lemmatize_chunk(chunk) for chunk in chunks]
    
    # Save the lemmatized chunks
    lemmatized_chunks_path = f"{file_name}_lemmatized.txt"
    with open(lemmatized_chunks_path, 'w', encoding='UTF-8') as file_lemmatized:
        for chunk in lemmatized_chunks:
            file_lemmatized.write(chunk + '\n')
    
    # Remove the temporary text file
    os.remove(temp_txt_path)
    
    # Return both output files for download (the lemmatized file was previously
    # generated but never exposed to the user)
    return cleaned_txt_path, lemmatized_chunks_path

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Processing App")
    
    with gr.Column():
        file_obj = gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"])
        submit_button = gr.Button("Process PDF")
        output_file = gr.File(label="Download Cleaned Text File")
        lemmatized_file = gr.File(label="Download Lemmatized Text File")

        submit_button.click(
            process_large_pdf,
            inputs=file_obj,
            outputs=[output_file, lemmatized_file]
        )

if __name__ == "__main__":
    demo.launch()