Shreyas094 committed on
Commit
8f71aa4
1 Parent(s): d06c0f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -6
app.py CHANGED
@@ -3,10 +3,15 @@ import gradio as gr
3
  from PyPDF2 import PdfReader
4
  import requests
5
  from dotenv import load_dotenv
 
6
  # Load environment variables
7
  load_dotenv()
8
  # Get the Hugging Face API token
9
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
 
 
 
10
  def summarize_text(text, instructions, agent_name):
11
  print(f"{agent_name}: Starting summarization")
12
  API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
@@ -19,7 +24,7 @@ def summarize_text(text, instructions, agent_name):
19
  response = requests.post(API_URL, headers=headers, json=payload)
20
  print(f"{agent_name}: Received response from API")
21
  return response.json()[0]["generated_text"]
22
- def process_pdf(pdf_file, chunk_instructions, final_instructions):
23
  print("Starting PDF processing")
24
  # Read PDF
25
  reader = PdfReader(pdf_file)
@@ -40,18 +45,40 @@ def process_pdf(pdf_file, chunk_instructions, final_instructions):
40
  # Concatenate Agent 1 summaries
41
  concatenated_summary = "\n\n".join(agent1_summaries)
42
  print(f"Concatenated Agent 1 summaries (length: {len(concatenated_summary)})")
43
- # Agent 2: Final summarization
44
- print("Agent 2: Starting final summarization")
45
- final_summary = summarize_text(concatenated_summary, final_instructions, "Agent 2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  print("Agent 2: Finished final summarization")
47
  return final_summary
48
- def pdf_summarizer(pdf_file, chunk_instructions, final_instructions):
49
  if pdf_file is None:
50
  print("Error: No PDF file uploaded")
51
  return "Please upload a PDF file."
52
  try:
53
  print(f"Starting summarization process for file: {pdf_file.name}")
54
- summary = process_pdf(pdf_file.name, chunk_instructions, final_instructions)
55
  print("Summarization process completed successfully")
56
  return summary
57
  except Exception as e:
@@ -63,6 +90,7 @@ iface = gr.Interface(
63
  inputs=[
64
  gr.File(label="Upload PDF"),
65
  gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
 
66
  gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
67
  ],
68
  outputs=gr.Textbox(label="Summary"),
 
3
  from PyPDF2 import PdfReader
4
  import requests
5
  from dotenv import load_dotenv
6
+ import tiktoken
7
  # Load environment variables
8
  load_dotenv()
9
  # Get the Hugging Face API token
10
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
11
+ # Initialize the tokenizer
12
+ tokenizer = tiktoken.get_encoding("cl100k_base")
13
+ def count_tokens(text):
14
+ return len(tokenizer.encode(text))
15
  def summarize_text(text, instructions, agent_name):
16
  print(f"{agent_name}: Starting summarization")
17
  API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
 
24
  response = requests.post(API_URL, headers=headers, json=payload)
25
  print(f"{agent_name}: Received response from API")
26
  return response.json()[0]["generated_text"]
27
+ def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions):
28
  print("Starting PDF processing")
29
  # Read PDF
30
  reader = PdfReader(pdf_file)
 
45
  # Concatenate Agent 1 summaries
46
  concatenated_summary = "\n\n".join(agent1_summaries)
47
  print(f"Concatenated Agent 1 summaries (length: {len(concatenated_summary)})")
48
+ print(f"Concatenated Summary:{concatenated_summary}")
49
+ # Sliding window approach
50
+ window_size = 3500 # in tokens
51
+ step_size = 3000 # overlap of 500 tokens
52
+ windows = []
53
+ current_position = 0
54
+ while current_position < len(concatenated_summary):
55
+ window_end = current_position
56
+ window_text = ""
57
+ while count_tokens(window_text) < window_size and window_end < len(concatenated_summary):
58
+ window_text += concatenated_summary[window_end]
59
+ window_end += 1
60
+ windows.append(window_text)
61
+ current_position += step_size
62
+ print(f"Created {len(windows)} windows for intermediate summarization")
63
+ # Intermediate summarization
64
+ intermediate_summaries = []
65
+ for i, window in enumerate(windows):
66
+ print(f"Processing window {i+1}/{len(windows)}")
67
+ summary = summarize_text(window, window_instructions, f"Window {i+1}")
68
+ intermediate_summaries.append(summary)
69
+ # Final summarization
70
+ final_input = "\n\n".join(intermediate_summaries)
71
+ print(f"Final input length: {count_tokens(final_input)} tokens")
72
+ final_summary = summarize_text(final_input, final_instructions, "Agent 2")
73
  print("Agent 2: Finished final summarization")
74
  return final_summary
75
+ def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions):
76
  if pdf_file is None:
77
  print("Error: No PDF file uploaded")
78
  return "Please upload a PDF file."
79
  try:
80
  print(f"Starting summarization process for file: {pdf_file.name}")
81
+ summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions)
82
  print("Summarization process completed successfully")
83
  return summary
84
  except Exception as e:
 
90
  inputs=[
91
  gr.File(label="Upload PDF"),
92
  gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
93
+ gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
94
  gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
95
  ],
96
  outputs=gr.Textbox(label="Summary"),