File size: 3,319 Bytes
868fea5
 
 
 
 
 
874bc90
868fea5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
from groq import Groq
import gradio as gr
from PyPDF2 import PdfReader 
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# The Groq API key must be supplied through the GROQ_API_KEY environment
# variable (for example as a Hugging Face Space secret). It is deliberately
# not hard-coded here: a key committed to source control is readable by
# everyone with access to the repository and must be treated as compromised
# and rotated.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

def summarize(file, prompt):
    """Summarize an uploaded PDF with the Groq chat model, guided by *prompt*.

    Only the first three and the last three pages of the document are read.
    For documents shorter than six pages the two ranges overlap, so each page
    is taken exactly once instead of being sent to the model twice.

    Args:
        file: The uploaded PDF as delivered by the Gradio file input; it is
            handed straight to ``PdfReader``.
        prompt: Instruction text that is placed in front of the extracted
            PDF text in the single user message.

    Returns:
        The message content of the first choice of the chat completion.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        # Surface a readable message in the UI instead of a PdfReader traceback.
        raise gr.Error("Please upload a PDF file before sending.")

    edge_pages = 3  # number of pages taken from each end of the document

    reader = PdfReader(file)
    num_pages = len(reader.pages)

    # Leading pages, then trailing pages that were not already covered by the
    # leading range (avoids duplicates for short documents).
    head_end = min(edge_pages, num_pages)
    tail_start = max(head_end, num_pages - edge_pages)
    page_numbers = [*range(head_end), *range(tail_start, num_pages)]

    # Guard against a page yielding no text so the join below cannot fail.
    page_texts = [
        reader.pages[page_number].extract_text() or ""
        for page_number in page_numbers
    ]
    full_text = ' '.join(page_texts)

    # The extracted text is sent as-is (no token filtering), after the prompt.
    input_text = prompt + " " + full_text

    # Ask the Groq-hosted model for the summary in a single user turn.
    chat_completion = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": input_text}],
        model="Mixtral-8x7b-32768")

    summary = chat_completion.choices[0].message.content
    return summary

# The prompt textbox is created outside the Blocks context so that it can be
# passed to gr.Examples first and rendered below the examples later.
prompt = gr.Textbox(placeholder="Pick one of the examples or type your prompt...", label = "Prompt Input", lines=8)

with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple")) as iface:
    # Upload area. The picker is restricted to the ".pdf" extension (Gradio
    # expects extensions with a leading dot); a user can still force a non-PDF
    # file through, in which case summarize simply reports an error.
    file = gr.File(label="Upload PDF", file_types=[".pdf"])

    # Read-only box that receives the summary.
    sum_box = gr.Textbox(placeholder="Your summary will appear here...", label = "Summary Output", lines=8, interactive= False)

    # Example prompts; clicking one fills in the prompt textbox.
    examples = gr.Examples(examples=[
        "Write a two-paragraph summary of this PDF document, emphasizing the key points and conclusions"
      , "Write a one-paragraph summary of the key findings or arguments presented in this PDF"
      , "Provide a bullet-point outline of the key insights from this PDF"
      , "Write a summary tweet (280 characters) based on the main points of this PDF"], inputs=[prompt])

    # Place the prompt textbox here (under the examples) and add the submit button.
    prompt.render()
    send = gr.Button("Send")

    # Footer row: author credit and GitHub link (the link is not right-aligned).
    with gr.Row():
        gr.Markdown("Made by Olivia VonCanon")
        link = "[View on Github](https://github.com/Liv6)"
        gr.Markdown(link)

    # Run summarize on the uploaded file and prompt when Send is clicked.
    send.click(fn=summarize, inputs=[file, prompt], outputs=sum_box)

iface.launch()