# TokenCounter / app.py
import os
import zipfile
import gradio as gr
from PyPDF4 import PdfFileReader
import tiktoken
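# Gradio app: extract the text of an uploaded PDF (or take pasted text),
# report its length and tiktoken token count, split it into overlapping
# character chunks, and offer the chunks as a downloadable zip.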
def extract_text_from_pdf(file_path):
    # Read the PDF page by page and concatenate the extracted text.
    with open(file_path, "rb") as file:
        pdf = PdfFileReader(file)
        text = ""
        for page_num in range(pdf.getNumPages()):
            text += pdf.getPage(page_num).extractText()
    return text
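# Note: extraction is best-effort; scanned or image-only PDFs have no text
# layer, so extractText() may return an empty string for them.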
def tokenize(text, model="gpt-3.5-turbo"):
    # disallowed_special=() encodes special-token text (e.g. "<|endoftext|>")
    # as ordinary text instead of raising an error.
    tokenizer = tiktoken.encoding_for_model(model)
    tokens = tokenizer.encode(text, disallowed_special=())
    return tokens
def count_tokens(text):
    return len(tokenize(text))
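# A rough sanity check (assuming the cl100k_base encoding tiktoken maps to
# gpt-3.5-turbo; exact counts can vary across tiktoken versions):
#   count_tokens("hello world")  # -> 2, for "hello" and " world"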
def analyse_text(text):
    num_tokens = count_tokens(text)
    result = []
    try:
        result.append(f"Text length: {len(text)}")
        result.append(f"Token count: {num_tokens}")
        result.append(f"Chars per token: {len(text) / num_tokens:.1f}")
    except ZeroDivisionError:
        # Empty input tokenizes to zero tokens, so the ratio is undefined.
        return 'no text'
    return '\n'.join(result)
def analyse_file(file):
    paper_text = extract_text_from_pdf(file.name)
    return paper_text
def write_chunks_to_files(chunks):
    file_paths = []
    for i, chunk in enumerate(chunks, start=1):
        file_path = f"chunk_{i}.txt"
        # Write as UTF-8 so non-ASCII characters from the PDF don't fail
        # under a platform-dependent default encoding.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        file_paths.append(file_path)
    return file_paths
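# Caveat: chunk files are written with fixed names into the working
# directory, so concurrent requests could clobber each other; a per-request
# tempfile.mkdtemp() directory would be a safer choice.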
def write_chunks_to_zip(chunks):
    file_paths = write_chunks_to_files(chunks)
    zip_file_name = "chunks.zip"
    with zipfile.ZipFile(zip_file_name, 'w') as zipf:
        for file in file_paths:
            zipf.write(file)
            os.remove(file)  # Remove the file after writing it into the zip
    return zip_file_name
def chunk_text(text, max_char, overlap):
    # Guard against a non-positive step size, which would loop forever.
    step = max_char - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_char")
    chunks = []
    start = 0
    end = max_char
    while start < len(text):
        if end >= len(text):
            end = len(text)
        chunk = text[start:end]
        num_tokens = count_tokens(chunk)
        chunks.append((chunk, len(chunk), num_tokens))
        start += step
        end = start + max_char
    return chunks
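# Sliding-window sketch: with max_char=5 and overlap=2 over "abcdefgh",
# windows start at 0, 3, 6 and yield "abcde", "defgh", "gh"; each
# consecutive pair shares `overlap` characters.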
def chunk_file(file, max_char, overlap):
    # Same as chunk_and_zip_text, but starting from an uploaded PDF.
    text = extract_text_from_pdf(file.name)
    return chunk_and_zip_text(text, max_char, overlap)
def chunk_and_zip_text(text, max_char, overlap):
    chunks = chunk_text(text, max_char, overlap)
    formatted_chunks = [
        f"Chunk[{i}]: Size: {len(c[0])} chars, {c[2]} tokens"
        for i, c in enumerate(chunks, start=1)
    ]
    zip_file_path = write_chunks_to_zip([c[0] for c in chunks])
    return '\n'.join(formatted_chunks), zip_file_path
with gr.Blocks() as demo:
    docs_input = gr.File(file_count="single", file_types=[".pdf"])
    text_to_chunk = gr.Textbox(label='Text to chunk', show_copy_button=True)
    tb_analysis = gr.Textbox(label='Text Analysis')
    sl_max_char_per_chunk = gr.Slider(1000, 300000, value=10000, label="Number of characters", info="Choose a number of characters per chunk")
    sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
    btn_chunk = gr.Button("Chunk text")
    tb_chunked_text = gr.Textbox(label='Chunks Info')
    download_link = gr.File(label='Download Chunks')
    # When a file is uploaded, put its extracted text into text_to_chunk;
    # that change in turn runs analyse_text and fills tb_analysis.
    docs_input.upload(analyse_file, inputs=[docs_input], outputs=[text_to_chunk])
    text_to_chunk.change(analyse_text, inputs=[text_to_chunk], outputs=[tb_analysis])
    btn_chunk.click(chunk_and_zip_text, inputs=[text_to_chunk, sl_max_char_per_chunk, sl_overlap], outputs=[tb_chunked_text, download_link])
demo.launch(debug=True, share=False)