# Gradio Space: extract text from a PDF, analyse its token counts, and split it
# into overlapping chunks downloadable as a zip archive.
import os | |
import zipfile | |
import gradio as gr | |
from PyPDF4 import PdfFileReader | |
import tiktoken | |
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*."""
    with open(file_path, "rb") as handle:
        reader = PdfFileReader(handle)
        # Extract while the file handle is still open; PyPDF4 reads lazily.
        pages = (reader.getPage(n).extractText() for n in range(reader.getNumPages()))
        return "".join(pages)
def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* into token ids using the tiktoken encoding for *model*.

    ``disallowed_special=()`` lets special-token text (e.g. "<|endoftext|>")
    pass through as ordinary text instead of raising.
    """
    encoding = tiktoken.encoding_for_model(model)
    return encoding.encode(text, disallowed_special=())
def count_tokens(text):
    """Return how many tokens *text* encodes to under the default model."""
    token_ids = tokenize(text)
    return len(token_ids)
def analyse_text(text):
    """Return a short report on *text*: char count, token count, chars/token.

    Returns the string 'no text' when *text* is empty or encodes to zero
    tokens (which would otherwise divide by zero).
    """
    num_tokens = count_tokens(text)
    # Guard the division explicitly. The original used a bare `except:` and
    # then did '\n'.join('no text'), which joined the *characters* of the
    # fallback string and returned "n\no\n \nt\ne\nx\nt".
    if not text or num_tokens == 0:
        return 'no text'
    result = [
        f"Text length: {len(text)}",
        f"Token counts: {num_tokens}",
        f"Char per token: {len(text) / num_tokens:.1f}",
    ]
    return '\n'.join(result)
def analyse_file(file):
    """Gradio upload callback: return the raw text of the uploaded PDF."""
    return extract_text_from_pdf(file.name)
def write_chunks_to_files(chunks):
    """Write each chunk string to ``chunk_<i>.txt`` and return the paths.

    Files are written with explicit UTF-8 encoding; the original relied on
    the platform default, which can raise UnicodeEncodeError for non-ASCII
    PDF text on Windows (cp1252).
    """
    file_paths = []
    for index, chunk in enumerate(chunks, start=1):
        path = f"chunk_{index}.txt"
        with open(path, "w", encoding="utf-8") as out:
            out.write(chunk)
        file_paths.append(path)
    return file_paths
def write_chunks_to_zip(chunks):
    """Bundle the chunk files into ``chunks.zip`` and return the zip's name.

    The loose ``chunk_*.txt`` files are deleted once archived.
    """
    zip_name = "chunks.zip"
    with zipfile.ZipFile(zip_name, 'w') as archive:
        for path in write_chunks_to_files(chunks):
            archive.write(path)
            os.remove(path)  # the archive now owns the content; drop the loose file
    return zip_name
def chunk_text(text, max_char, overlap):
    """Split *text* into overlapping windows of at most *max_char* characters.

    Returns a list of ``(chunk, char_count, token_count)`` tuples.

    The window advances by ``max_char - overlap`` each step. The UI sliders
    allow overlap (up to 20000) to exceed max_char (down to 1000), which made
    the original loop spin forever; the step is clamped to at least 1 so the
    loop always terminates.
    """
    step = max(1, max_char - overlap)  # guard: original hung when overlap >= max_char
    chunks = []
    start = 0
    while start < len(text):
        # Python slicing clamps past-the-end indices, so no explicit end check.
        piece = text[start:start + max_char]
        chunks.append((piece, len(piece), count_tokens(piece)))
        start += step
    return chunks
def chunk_file(file, max_char, overlap):
    """Gradio callback: chunk an uploaded PDF; return (summary text, zip path).

    Delegates to ``chunk_and_zip_text`` so the chunk-formatting and zipping
    logic lives in one place — the original duplicated it here verbatim.
    """
    text = extract_text_from_pdf(file.name)
    return chunk_and_zip_text(text, max_char, overlap)
def chunk_and_zip_text(text, max_char, overlap):
    """Chunk *text*, zip the chunks, and return (per-chunk summary, zip path)."""
    pieces = chunk_text(text, max_char, overlap)
    summary_lines = []
    for index, (body, _size, tokens) in enumerate(pieces, start=1):
        summary_lines.append(f"Chunk[{index}]: Size: {len(body)} chars, {tokens} tokens")
    zip_path = write_chunks_to_zip([body for body, _size, _tokens in pieces])
    return '\n'.join(summary_lines), zip_path
# ---- Gradio UI wiring ------------------------------------------------------
with gr.Blocks() as demo:
    # Component creation order fixes the on-page layout; keep it stable.
    docs_input = gr.File(file_count="single", file_types=[".pdf"])
    text_to_chunk = gr.Textbox(label='Text to chunk', show_copy_button=True)
    tb_analysis = gr.Textbox(label='Text Analysis')
    sl_max_char_per_chunk = gr.Slider(
        1000, 300000, value=10000,
        label="Number of characters",
        info="Choose a number of characters per chunk",
    )
    sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
    btn_chunk = gr.Button("Chunk text")
    tb_chunked_text = gr.Textbox(label='Chunks Info')
    download_link = gr.File(label='Download Chunks')

    # Uploading a PDF fills the text box; any text change refreshes the analysis.
    docs_input.upload(analyse_file, inputs=[docs_input], outputs=[text_to_chunk])
    text_to_chunk.change(analyse_text, inputs=[text_to_chunk], outputs=[tb_analysis])
    btn_chunk.click(
        chunk_and_zip_text,
        inputs=[text_to_chunk, sl_max_char_per_chunk, sl_overlap],
        outputs=[tb_chunked_text, download_link],
    )

demo.launch(debug=True, share=False)