Creatingdataset / app.py
Yoxas's picture
Update app.py
70637e5 verified
raw
history blame
4.22 kB
import os
import re
import tempfile
from concurrent.futures import ThreadPoolExecutor

import gradio as gr
import pandas as pd
import PyPDF2
from transformers import pipeline, AutoTokenizer
# Load the LED tokenizer and model.
# NOTE(review): this checkpoint is a LED seq2seq summarization model; wiring it
# into a "text-classification" pipeline looks questionable — the labels it emits
# are unlikely to be meaningful. Confirm this is intentional.
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
# Load the summarization model and tokenizer (DistilBART fine-tuned on CNN/DailyMail).
# Both pipelines are created once at import time and shared by all requests.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
def clean_text(text):
    """Strip every character that is not an ASCII letter, digit, or whitespace."""
    unwanted = re.compile(r'[^a-zA-Z0-9\s]')
    return unwanted.sub('', text)
def extract_text(pdf_file):
    """Return all page text of *pdf_file* concatenated, or None on failure.

    Encrypted PDFs are skipped; any extraction error is printed and swallowed.
    In both cases None is returned so callers can filter the file out.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        if reader.is_encrypted:
            print(f"Skipping encrypted file: {pdf_file}")
            return None
        page_texts = (page.extract_text() or '' for page in reader.pages)
        return ''.join(page_texts)
    except Exception as e:
        print(f"Error extracting text from {pdf_file}: {e}")
        return None
def split_text(text, chunk_size=1024):
    """Yield *text* in successive space-joined chunks of at most *chunk_size* words."""
    words = text.split()
    starts = range(0, len(words), chunk_size)
    for start in starts:
        yield ' '.join(words[start:start + chunk_size])
def classify_text(text):
    """Return the top label from the module-level `classifier` pipeline.

    Returns the literal string "Unable to classify" when the pipeline
    produces no prediction (empty result list).
    """
    try:
        predictions = classifier(text)
        return predictions[0]['label']
    except IndexError:
        return "Unable to classify"
def summarize_text(text, max_length=100, min_length=30):
    """Summarize *text* with the module-level `summarizer` pipeline.

    Decoding is greedy (do_sample=False). Returns the literal string
    "Unable to summarize" when the pipeline produces no output.
    """
    try:
        output = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
        return output[0]['summary_text']
    except IndexError:
        return "Unable to summarize"
def extract_title(text, max_length=20):
    """Produce a short, title-like summary of *text* via the `summarizer` pipeline.

    Returns the literal string "Unable to extract title" when the pipeline
    produces no output.
    """
    try:
        output = summarizer(text, max_length=max_length, min_length=5, do_sample=False)
        return output[0]['summary_text']
    except IndexError:
        return "Unable to extract title"
def process_pdf(pdf_file):
    """Turn one PDF into a [title, abstract, content] row, or None if unreadable.

    The title is summarized from the first 512 words; the abstract and cleaned
    content are built chunk by chunk (512 words per chunk) and re-joined.
    """
    text = extract_text(pdf_file)
    if text is None:
        # Encrypted or unreadable file — caller drops the None.
        return None
    # Derive a short title from the opening of the document only.
    opening = ' '.join(text.split()[:512])
    title = extract_title(opening)
    abstracts = []
    cleaned_chunks = []
    for chunk in split_text(text, chunk_size=512):
        abstracts.append(summarize_text(chunk))
        cleaned_chunks.append(clean_text(chunk))
    return [title, ' '.join(abstracts), ' '.join(cleaned_chunks)]
def process_pdfs(files):
    """Run `process_pdf` over *files* in a thread pool, dropping failed (None) rows."""
    with ThreadPoolExecutor() as pool:
        rows = pool.map(process_pdf, files)
        return [row for row in rows if row is not None]
# Gradio interface function
def gradio_interface(files):
    """Gradio handler: build the dataset from uploaded PDFs and return a CSV path.

    The CSV is written to the system temp directory. The previous hard-coded
    "/content/drive/..." path only exists inside a mounted Google Colab
    session and crashed everywhere else.
    """
    data = process_pdfs([file.name for file in files])
    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
    csv_path = os.path.join(tempfile.gettempdir(), "output.csv")
    df.to_csv(csv_path, index=False)
    return csv_path
# Gradio app setup.
# `gr.inputs.*` was removed in Gradio 3.x; the top-level gr.File component
# replaces gr.inputs.File, so the old call raised AttributeError at startup.
gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_count="multiple", file_types=[".pdf"]),
    outputs="text",
    title="PDF Research Paper Dataset Creator",
    description="Upload PDF research papers to create a dataset with title, abstract, and content.",
).launch()