"""Gradio app that splits text into chunks and tabulates per-chunk statistics."""

import gradio as gr
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Constants for default values
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10

# Single source of truth for the result columns, shared by the DataFrame
# built in tokenize_text and the headers of the Gradio output component.
OUTPUT_COLUMNS = ["Chunk #", "Text Chunk", "Character Count", "Token Count"]


def _as_int(value, default):
    """Return ``value`` truncated to an int, or ``default`` when it is None."""
    return default if value is None else int(value)


def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """Split ``text`` into chunks and describe each chunk in a table.

    Args:
        method: Name of the splitting method. Only
            "RecursiveCharacterTextSplitter" is handled; any other value
            (including None) produces an empty table.
        text: The text to split. None, empty, or whitespace-only text
            produces an empty table.
        chunk_size: Maximum chunk length in characters (``len`` is the length
            function). None falls back to DEFAULT_CHUNK_SIZE.
        chunk_overlap: Overlap between chunks, in characters. None falls back
            to DEFAULT_CHUNK_OVERLAP.
        num_chunks: Maximum number of chunks to include in the result. None
            falls back to DEFAULT_NUM_CHUNKS; negative values are treated as 0.

    Returns:
        A pandas DataFrame that always has the columns in OUTPUT_COLUMNS, one
        row per returned chunk. 'Chunk #' is the zero-based chunk index and
        'Token Count' is the number of whitespace-separated words in the chunk.
    """
    # The numeric inputs come from gr.Number widgets, which hand over floats;
    # presumably None when a field is cleared (confirm against the Gradio
    # version in use). Coerce all three the same way instead of only one.
    chunk_size = _as_int(chunk_size, DEFAULT_CHUNK_SIZE)
    chunk_overlap = _as_int(chunk_overlap, DEFAULT_CHUNK_OVERLAP)
    # Clamp at zero: a negative limit would slice from the end of the chunk
    # list (e.g. [:-1] silently drops the last chunk) rather than limit it.
    num_chunks = max(_as_int(num_chunks, DEFAULT_NUM_CHUNKS), 0)

    rows = []
    has_text = bool(text and text.strip())
    if has_text and method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        chunks = text_splitter.split_text(text)[:num_chunks]
        rows = [
            {
                'Chunk #': index,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                'Token Count': len(chunk.split()),
            }
            for index, chunk in enumerate(chunks)
        ]

    # Passing the columns explicitly keeps the headers present even when
    # there are no rows (no text, unknown method, or a limit of zero).
    return pd.DataFrame(rows, columns=OUTPUT_COLUMNS)


iface = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=["RecursiveCharacterTextSplitter"],
        ),
        gr.Textbox(
            label="Enter Text",
            lines=10,
            placeholder="Type or paste text here.",
        ),
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
    ],
    outputs=gr.Dataframe(headers=OUTPUT_COLUMNS, height=900),
    title="Text Tokenization Tool",
    description="A tool for tokenizing text using different methods.",
)

# Launched at module level, as in the original script, so running the file
# directly starts the web UI.
iface.launch()