# HuggingFace Spaces app: chunk PDF/pasted text into token-counted pieces.
| import os | |
| import zipfile | |
| import gradio as gr | |
| from PyPDF4 import PdfFileReader | |
| import tiktoken | |
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*."""
    with open(file_path, "rb") as fh:
        reader = PdfFileReader(fh)
        page_texts = (reader.getPage(n).extractText() for n in range(reader.getNumPages()))
        return "".join(page_texts)
def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* into a list of token ids using *model*'s tiktoken encoding."""
    encoding = tiktoken.encoding_for_model(model)
    # disallowed_special=() treats special-token text as ordinary text instead of raising.
    return encoding.encode(text, disallowed_special=())
def count_tokens(text):
    """Return how many tokens *text* encodes to under the default model."""
    token_ids = tokenize(text)
    return len(token_ids)
def analyse_text(text):
    """Return a multi-line summary of *text*: length, token count, chars per token.

    Returns the string 'no text' when the text tokenizes to zero tokens
    (e.g. empty input), where chars-per-token would divide by zero.

    Fixes two defects in the original:
    - a bare ``except:`` that swallowed every error, and
    - ``'\\n'.join('no text')`` on the error path, which joined the
      *characters* of the string and returned "n\\no\\n \\nt\\ne\\nx\\nt".
    """
    num_tokens = count_tokens(text)
    if num_tokens == 0:
        return 'no text'
    result = [
        f"Text length: {len(text)}",
        f"Token counts: {num_tokens}",
        f"Char per token: {'%.1f' % (len(text) / num_tokens)}",
    ]
    return '\n'.join(result)
def analyse_file(file):
    """Extract and return the raw text of an uploaded PDF (gradio file object)."""
    return extract_text_from_pdf(file.name)
def write_chunks_to_files(chunks):
    """Write each chunk to ``chunk_<i>.txt`` in the CWD; return the paths in order.

    Parameters:
        chunks: iterable of strings, one per output file (numbered from 1).

    Returns:
        list[str]: the file paths, in the same order as *chunks*.
    """
    file_paths = []
    for index, chunk in enumerate(chunks, start=1):
        file_path = f"chunk_{index}.txt"
        # Explicit utf-8: the platform default encoding (e.g. cp1252 on Windows)
        # can raise UnicodeEncodeError on non-ASCII text extracted from PDFs.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        file_paths.append(file_path)
    return file_paths
def write_chunks_to_zip(chunks):
    """Bundle the chunk text files into ``chunks.zip`` and return its name.

    Each intermediate ``chunk_<i>.txt`` is deleted once it has been added
    to the archive, so only the zip remains on disk.
    """
    chunk_paths = write_chunks_to_files(chunks)
    zip_file_name = "chunks.zip"
    with zipfile.ZipFile(zip_file_name, 'w') as archive:
        for path in chunk_paths:
            archive.write(path)
            os.remove(path)  # the text now lives only inside the archive
    return zip_file_name
def chunk_text(text, max_char, overlap):
    """Split *text* into chunks of at most *max_char* chars overlapping by *overlap*.

    Parameters:
        text: the string to split.
        max_char: maximum characters per chunk.
        overlap: characters shared between consecutive chunks; must be < max_char.

    Returns:
        list of (chunk_text, char_count, token_count) tuples.

    Raises:
        ValueError: if overlap >= max_char. The original advanced ``start`` by
        ``max_char - overlap`` each iteration, so a non-positive step looped
        forever — and the UI sliders (overlap up to 20000, max_char down to
        1000) make that reachable.
    """
    step = max_char - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_char")
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_char, len(text))
        chunk = text[start:end]
        num_tokens = count_tokens(chunk)
        chunks.append((chunk, len(chunk), num_tokens))
        start += step
    return chunks
def chunk_file(file, max_char, overlap):
    """Chunk the text of an uploaded PDF; return (chunk summary, zip path).

    Parameters:
        file: gradio file object (``file.name`` is the path on disk).
        max_char: maximum characters per chunk.
        overlap: characters shared between consecutive chunks.

    Returns:
        tuple[str, str]: a newline-joined per-chunk summary, and the path
        of the zip archive containing the chunk files.
    """
    text = extract_text_from_pdf(file.name)
    # Same pipeline as chunking pasted text — delegate instead of duplicating
    # the format/zip logic (the original repeated chunk_and_zip_text verbatim).
    return chunk_and_zip_text(text, max_char, overlap)
def chunk_and_zip_text(text, max_char, overlap):
    """Chunk *text*, zip the chunks, and return (per-chunk summary, zip path)."""
    pieces = chunk_text(text, max_char, overlap)
    summary_lines = []
    for index, piece in enumerate(pieces, start=1):
        body = piece[0]
        token_count = piece[2]
        summary_lines.append(f"Chunk[{index}]: Size: {len(body)} chars, {token_count} tokens")
    zip_file_path = write_chunks_to_zip([piece[0] for piece in pieces])
    return '\n'.join(summary_lines), zip_file_path
# Gradio UI: upload a PDF (or paste text), see length/token stats live, then
# chunk the text and download the chunks as a zip archive.
with gr.Blocks() as demo:
    docs_input = gr.File(file_count="single", file_types=[".pdf"])  # PDF upload
    text_to_chunk = gr.Textbox(label='Text to chunk',show_copy_button=True)  # editable source text
    tb_analysis = gr.Textbox(label='Text Analysis')  # output of analyse_text
    sl_max_char_per_chunk = gr.Slider(1000, 300000, value=10000, label="Number of characters", info="Choose a number of characters per chunk")
    sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
    btn_chunk = gr.Button("Chunk text")
    tb_chunked_text = gr.Textbox(label='Chunks Info')  # per-chunk size/token summary
    download_link = gr.File(label='Download Chunks')  # serves the generated chunks.zip
    # On upload, extract the PDF text into text_to_chunk; its change event
    # below then refreshes tb_analysis.
    docs_input.upload(analyse_file,inputs=[docs_input], outputs=[text_to_chunk])
    # Re-analyse whenever the text changes (typed by hand or filled from a PDF).
    text_to_chunk.change(analyse_text,inputs=[text_to_chunk],outputs=[tb_analysis])
    # Chunk with the current slider settings; returns (summary text, zip path).
    btn_chunk.click(chunk_and_zip_text, inputs=[text_to_chunk, sl_max_char_per_chunk, sl_overlap], outputs=[tb_chunked_text, download_link])
demo.launch(debug=True, share=False)