Creatingdataset / app.py
Yoxas's picture
Update app.py
70637e5 verified
raw
history blame
4.22 kB
import os
import re
import tempfile
from concurrent.futures import ThreadPoolExecutor

import gradio as gr
import pandas as pd
import PyPDF2
from transformers import pipeline, AutoTokenizer
# Load the LED tokenizer and model.
# NOTE(review): this checkpoint is a LED seq2seq summarization model; wiring it
# into a "text-classification" pipeline looks questionable — the labels it emits
# are unlikely to be meaningful. Confirm this is intentional.
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
# Load the summarization model and tokenizer (DistilBART fine-tuned on CNN/DailyMail).
# Both pipelines are created once at import time and shared by all requests.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
def clean_text(text):
    """Strip every character that is not an ASCII letter, digit, or whitespace."""
    unwanted = re.compile(r'[^a-zA-Z0-9\s]')
    return unwanted.sub('', text)
def extract_text(pdf_file):
    """Return all page text of *pdf_file* concatenated, or None on failure.

    Encrypted PDFs are skipped; any extraction error is printed and swallowed.
    In both cases None is returned so callers can filter the file out.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        if reader.is_encrypted:
            print(f"Skipping encrypted file: {pdf_file}")
            return None
        page_texts = (page.extract_text() or '' for page in reader.pages)
        return ''.join(page_texts)
    except Exception as e:
        print(f"Error extracting text from {pdf_file}: {e}")
        return None
def split_text(text, chunk_size=1024):
    """Yield *text* in successive space-joined chunks of at most *chunk_size* words."""
    words = text.split()
    starts = range(0, len(words), chunk_size)
    for start in starts:
        yield ' '.join(words[start:start + chunk_size])
def classify_text(text):
    """Return the top label from the module-level `classifier` pipeline.

    Returns the literal string "Unable to classify" when the pipeline
    produces no prediction (empty result list).
    """
    try:
        predictions = classifier(text)
        return predictions[0]['label']
    except IndexError:
        return "Unable to classify"
def summarize_text(text, max_length=100, min_length=30):
    """Summarize *text* with the module-level `summarizer` pipeline.

    Decoding is greedy (do_sample=False). Returns the literal string
    "Unable to summarize" when the pipeline produces no output.
    """
    try:
        output = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
        return output[0]['summary_text']
    except IndexError:
        return "Unable to summarize"
def extract_title(text, max_length=20):
    """Produce a short, title-like summary of *text* via the `summarizer` pipeline.

    Returns the literal string "Unable to extract title" when the pipeline
    produces no output.
    """
    try:
        output = summarizer(text, max_length=max_length, min_length=5, do_sample=False)
        return output[0]['summary_text']
    except IndexError:
        return "Unable to extract title"
def process_pdf(pdf_file):
    """Turn one PDF into a [title, abstract, content] row, or None if unreadable.

    The title is summarized from the first 512 words; the abstract and cleaned
    content are built chunk by chunk (512 words per chunk) and re-joined.
    """
    text = extract_text(pdf_file)
    if text is None:
        # Encrypted or unreadable file — caller drops the None.
        return None
    # Derive a short title from the opening of the document only.
    opening = ' '.join(text.split()[:512])
    title = extract_title(opening)
    abstracts = []
    cleaned_chunks = []
    for chunk in split_text(text, chunk_size=512):
        abstracts.append(summarize_text(chunk))
        cleaned_chunks.append(clean_text(chunk))
    return [title, ' '.join(abstracts), ' '.join(cleaned_chunks)]
def process_pdfs(files):
    """Run `process_pdf` over *files* in a thread pool, dropping failed (None) rows."""
    with ThreadPoolExecutor() as pool:
        rows = pool.map(process_pdf, files)
        return [row for row in rows if row is not None]
# Gradio interface function
def gradio_interface(files):
    """Gradio handler: build the dataset from uploaded PDFs and return a CSV path.

    The CSV is written to the system temp directory. The previous hard-coded
    "/content/drive/..." path only exists inside a mounted Google Colab
    session and crashed everywhere else.
    """
    data = process_pdfs([file.name for file in files])
    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
    csv_path = os.path.join(tempfile.gettempdir(), "output.csv")
    df.to_csv(csv_path, index=False)
    return csv_path
# Gradio app setup.
# `gr.inputs.*` was removed in Gradio 3.x; the top-level gr.File component
# replaces gr.inputs.File, so the old call raised AttributeError at startup.
gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_count="multiple", file_types=[".pdf"]),
    outputs="text",
    title="PDF Research Paper Dataset Creator",
    description="Upload PDF research papers to create a dataset with title, abstract, and content.",
).launch()