Spaces:

wfranco
/

abstract-summary

Runtime error

App Files Files Community

wfranco committed on Dec 12, 2023

Commit

9bedcbe

1 Parent(s): 96f46f4

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -14

app.py CHANGED Viewed

@@ -1,18 +1,134 @@
 #!pip install gradio
-import gradio as gra
-def user_greeting(name):
-    return "Hi! " + name + " Welcome to your first Gradio application!😎"
-#define gradio interface and other parameters
-app =  gra.Interface(fn = user_greeting, inputs="text", outputs="text")
-app.launch()
-demo = gr.Interface(
-    fn=generate_audio_func,
-    inputs=input_component,
-    outputs=output_component,
-    title=app_name,
-    description=app_description
 )
-demo.launch()

 #!pip install gradio
+import gradio as gr
+def read_pdf(pdf_path):
+    """Extract the text elements of every page of a PDF, top to bottom.
+
+    Args:
+        pdf_path: Path of the PDF file; passed unchanged to ``extract_pages``.
+
+    Returns:
+        dict mapping ``'Page_<n>'`` (``n`` starts at 0) to a five-item list
+        ``[page_text, line_format, text_from_images, text_from_tables,
+        page_content]``. ``page_text`` and ``page_content`` each hold the
+        text returned by ``text_extraction`` for every text element of the
+        page; ``line_format`` holds the matching format value. The image and
+        table lists are always empty here and are kept only so the shape of
+        each entry stays the same for callers.
+
+    Note:
+        ``extract_pages``, ``LTTextContainer`` and ``text_extraction`` must be
+        supplied by the enclosing module. NOTE(review): the first two look
+        like pdfminer.six names -- confirm the imports exist.
+    """
+    # The previous version also opened the file with PyPDF2 and pdfplumber
+    # but never used the results. It re-opened pdfplumber once per page
+    # without closing it, and closed the PyPDF2 file handle inside the page
+    # loop, so the reader was used on a closed file from the second page on.
+    # Those handles are removed: only the layout pass below feeds the result.
+    text_per_page = {}
+    for pagenum, page in enumerate(extract_pages(pdf_path)):
+        page_text = []
+        line_format = []
+        page_content = []
+        # Order the layout elements by their y1 coordinate, largest first, so
+        # they are visited in the order they appear on the page. The sort is
+        # stable, so elements with equal y1 keep their original order.
+        layout_elements = sorted(page._objs, key=lambda el: el.y1, reverse=True)
+        for element in layout_elements:
+            # Only text containers contribute; every other element is skipped.
+            if not isinstance(element, LTTextContainer):
+                continue
+            line_text, format_per_line = text_extraction(element)
+            page_text.append(line_text)
+            line_format.append(format_per_line)
+            page_content.append(line_text)
+        # Image and table slots stay empty lists (nothing fills them here).
+        text_per_page['Page_' + str(pagenum)] = [page_text, line_format, [], [], page_content]
+    return text_per_page
+# Module-level driver: these statements run as soon as the module is loaded.
+# NOTE(review): the '/content/...' location looks like a Google Colab path --
+# confirm the PDF actually exists there in the environment that runs this app.
+pdf_path = '/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf'
+text_per_page = read_pdf(pdf_path)
+# 'Page_0' is the key read_pdf assigns to the first page (page index 0).
+Page_0 = text_per_page['Page_0']
+def nested_list_to_string(nested_list):
+    """Concatenate every string found in an arbitrarily nested list.
+
+    Items are visited depth-first in their original order. Strings are kept
+    as they are, lists are flattened recursively, and any other kind of item
+    (numbers, None, tuples, ...) is ignored.
+
+    Args:
+        nested_list: Iterable whose items may be strings, lists, or anything else.
+
+    Returns:
+        A single string with all collected strings joined without separator.
+    """
+    pieces = []
+    for item in nested_list:
+        if isinstance(item, str):
+            pieces.append(item)
+        elif isinstance(item, list):
+            # Descend into the sub-list and keep its flattened text.
+            pieces.append(nested_list_to_string(item))
+    return ''.join(pieces)
+# Look up the first page's entry and flatten every string in it into one string.
+# NOTE(review): string_result is not referenced again in the visible part of
+# this file -- confirm whether it is still needed.
+Page_0 = text_per_page['Page_0']
+string_result = nested_list_to_string(Page_0)
+def extract_abstract(page_0):
+    """Return the text between 'Abstract' and the following 'Introduction'.
+
+    Args:
+        page_0: Nested list of strings (for example one page entry produced by
+            ``read_pdf``). Items that are neither lists nor strings are ignored.
+
+    Returns:
+        The text found after the first 'Abstract' and before the first
+        'Introduction' that follows it, stripped of surrounding whitespace.
+        If either marker is missing, the message
+        "Abstract or Introduction section not found." is returned instead.
+        Both markers are matched case-sensitively.
+    """
+    not_found = "Abstract or Introduction section not found."
+
+    def flatten(items):
+        # Depth-first concatenation of every str inside nested lists.
+        # Kept local so this function stays self-contained.
+        return ''.join(
+            flatten(item) if isinstance(item, list) else item
+            for item in items
+            if isinstance(item, (list, str))
+        )
+
+    full_text = flatten(page_0)
+    marker = 'Abstract'
+    start_index = full_text.find(marker)
+    if start_index == -1:
+        return not_found
+    body_start = start_index + len(marker)
+    # Search only AFTER the 'Abstract' heading. Searching from the start of
+    # the text would match an 'Introduction' that appears earlier (e.g. in
+    # the title), giving an inverted slice and a silent empty result.
+    end_index = full_text.find('Introduction', body_start)
+    if end_index == -1:
+        return not_found
+    return full_text[body_start:end_index].strip()
+# Example usage
+# Pull the abstract out of the first page's entry.
+# NOTE(review): when the markers are missing, extract_abstract returns its
+# "not found" message, and that message is what gets summarised below.
+Page_0 = text_per_page['Page_0']
+abstract_text = extract_abstract(Page_0)
+wall_of_text = abstract_text
+# NOTE(review): `summarizer` is not defined in the visible part of this file.
+# The result[0]['summary_text'] access below suggests a Hugging Face
+# summarization pipeline -- confirm where it is created.
+result = summarizer(
+    wall_of_text,
+    min_length=1,
+    max_length=30,
+    no_repeat_ngram_size=3,
+    encoder_no_repeat_ngram_size=3,
+    repetition_penalty=3.5,
+    num_beams=4,
+    early_stopping=True,
 )
+# Access the first element of the list (which is the dictionary) and then the value of 'summary_text'
+summary_string = result[0]['summary_text']
+print(summary_string)
+def show_summary():
+    """Return the summary computed at module load so Gradio can display it."""
+    return summary_string
+
+
+# Changes from the previous version of these lines:
+#   * dropped a stray ")" that made the whole module a SyntaxError;
+#   * use the `gr` alias this file imports (`gra` is no longer imported);
+#   * `user_greeting` no longer exists, so a zero-argument callback is used;
+#   * `inputs`/`outputs` describe components, they do not carry the summary
+#     text itself -- no inputs, one text output.
+app = gr.Interface(fn=show_summary, inputs=[], outputs="text")
+app.launch()