"""Read a paper from a PDF file and split it into its parts.

The extracted parts are the title, the abstract, the figure and table
captions, the main content and the appendix. These functions work best with
ICLR / NeurIPS papers.
"""

from collections import deque
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

# Lines starting with one of these prefixes are dropped before parsing.
_BANNER_PREFIXES = (
    'Under review',
    'Published as',
    'Paper under double-blind review',
)


def extract_text_from_pdf(path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        path (str): Path to the PDF file.

    Returns:
        The text of all pages, with form feed characters (``\\x0c``)
        replaced by newlines.
    """
    with open(path, 'rb') as file_handle, StringIO() as text_output:
        # The resource manager stores resources shared between pages.
        resource_manager = PDFResourceManager()
        # The converter writes the text of each processed page to text_output.
        converter = TextConverter(resource_manager, text_output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, converter)
            for page in PDFPage.get_pages(file_handle, caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before the `with` statement closes it.
            extracted_text = text_output.getvalue()
        finally:
            # Release the converter even when a page fails to process.
            converter.close()

    return extracted_text.replace('\x0c', '\n')


def _take_paragraph(lines: deque) -> str:
    """Pop lines up to (not including) the next empty line.

    Returns the popped lines concatenated, each followed by a single space.
    Stops quietly when ``lines`` runs out.
    """
    taken = []
    while lines and lines[0] != "":
        taken.append(lines.popleft())
    return "".join(line + ' ' for line in taken)


def _take_until_heading(lines: deque, heading: str) -> list:
    """Pop and return the lines in front of the ``heading`` line.

    A line is the heading when its lowercased form equals ``heading``. The
    heading line itself is left at the front of ``lines``.

    Raises:
        IndexError: If ``lines`` runs out before the heading is found.
    """
    taken = []
    while True:
        if not lines:
            # IndexError is what indexing the exhausted list raised before;
            # the type is kept so existing callers keep working.
            raise IndexError(f"reached the end of the text without finding a {heading!r} line")
        if lines[0].lower() == heading:
            return taken
        taken.append(lines.popleft())


def convert_text_into_dict(text: str) -> dict:
    """Split the text extracted from a paper into its parts.

    The text is consumed line by line, in this order: blank lines, the title
    (up to the first empty line), everything up to a line reading "abstract"
    (case-insensitive; discarded as author information), the abstract (up to
    a line reading "introduction"), the main content (up to a line starting
    with "references"), and finally the appendix, which starts at the first
    upper-case line beginning with "A" (for example "A ENVIRONMENT DETAILS").
    Within the main content, a line starting with "figure" or "table"
    (case-insensitive) opens a caption that runs to the next empty line.

    Args:
        text (str): The text extracted from the PDF.

    Returns:
        A dictionary with the keys "Title", "Abstract",
        "Figures/Tables Captions", "Main Content" and "Appendix", each
        mapping to a stripped string.

    Raises:
        IndexError: If the text has no "abstract" line, or no "introduction"
            line after it.
    """
    lines = deque(
        line for line in text.split('\n')
        if not line.startswith(_BANNER_PREFIXES)
    )

    # Remove the blank lines before the title.
    while lines and lines[0].strip() == "":
        lines.popleft()

    # The title is the first paragraph. Note that capitalize() lowercases
    # everything after the first character.
    title = _take_paragraph(lines).strip().capitalize()

    # Drop the author information between the title and the abstract, then
    # the "Abstract" heading itself.
    _take_until_heading(lines, "abstract")
    lines.popleft()

    # The "Introduction" heading stays in `lines`, so it becomes the first
    # line of the main content.
    abstract = "".join(line + ' ' for line in _take_until_heading(lines, "introduction"))

    main_parts = []
    figure_captions = []
    table_captions = []
    while lines and not lines[0].lower().startswith("references"):
        lowered = lines[0].lower()
        if lowered.startswith("figure"):
            figure_captions.append(_take_paragraph(lines))
        elif lowered.startswith("table"):
            # The prefix must be lowercase because the line was lowercased.
            table_captions.append(_take_paragraph(lines))
        else:
            main_parts.append(lines.popleft() + ' ')
    main_content = "".join(main_parts)
    captions = "\n".join(figure_captions) + "\n" + "\n".join(table_captions)

    # We are now at the "References" section (or at the end of the text).
    # Skip until the first section title of the appendix.
    # Example section title: "A ENVIRONMENT DETAILS"
    while lines and not (lines[0].isupper() and lines[0][0] == "A"):
        lines.popleft()
    appendix = "".join(line + ' ' for line in lines)

    return {
        "Title": title.strip(),
        "Abstract": abstract.strip(),
        "Figures/Tables Captions": captions.strip(),
        "Main Content": main_content.strip(),
        "Appendix": appendix.strip(),
    }


if __name__ == "__main__":
    # These imports are only needed when the module is run as a script.
    from agentreview.utility.authentication_utils import read_and_set_openai_key
    from agentreview.review import get_lm_review

    read_and_set_openai_key()

    path = "data/rejected/6359.pdf"
    text = extract_text_from_pdf(path)
    parsed_paper = convert_text_into_dict(text)

    review_generated = get_lm_review(parsed_paper)
    print(review_generated["review_generated"])