# botchagalupe's picture
# First Commit
# f53a618
import gradio as gr
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Constants for default values.
# These pre-fill the three numeric inputs of the Gradio interface defined below.
DEFAULT_CHUNK_SIZE = 100  # initial "Chunk Size"; the splitter measures length with len(), i.e. characters
DEFAULT_CHUNK_OVERLAP = 0  # initial "Chunk Overlap" handed to the splitter
DEFAULT_NUM_CHUNKS = 10  # initial "Number of Chunks to Display" (how many leading chunks are kept)
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Split ``text`` into chunks and describe each chunk in a DataFrame.

    Args:
        method: Name of the splitting strategy. Only
            ``"RecursiveCharacterTextSplitter"`` is supported.
        text: Text to split. ``None``, empty, or whitespace-only input
            produces an empty result.
        chunk_size: Forwarded to the splitter as ``chunk_size``; lengths are
            measured with ``len``, i.e. in characters.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        num_chunks: Maximum number of leading chunks to keep. Negative
            values are treated as 0.

        The three numeric arguments may arrive as floats (for example from a
        numeric UI widget) and are truncated to ``int`` before use.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'Chunk #'`` (zero-based
        index), ``'Text Chunk'``, ``'Character Count'`` and ``'Token Count'``.
        ``'Token Count'`` is the number of whitespace-separated words in the
        chunk, not a model-tokenizer count.

    Raises:
        ValueError: If ``text`` is non-blank and ``method`` is not supported.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']

    # Nothing to split: return an empty frame that still carries the headers.
    if not text or not text.strip():
        return pd.DataFrame(columns=columns)

    # Fail with a clear message instead of an UnboundLocalError further down
    # when no method (None) or an unknown method is supplied.
    if method != "RecursiveCharacterTextSplitter":
        raise ValueError(f"Unsupported tokenization method: {method!r}")

    # Normalise all numeric inputs the same way; previously only num_chunks
    # was converted and the splitter received raw floats.
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)
    # Clamp so a negative value cannot turn the slice into "drop from the end".
    num_chunks = max(int(num_chunks), 0)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_text(text)[:num_chunks]

    rows = [
        {
            'Chunk #': index,
            'Text Chunk': chunk,
            'Character Count': len(chunk),
            'Token Count': len(chunk.split()),
        }
        for index, chunk in enumerate(chunks)
    ]
    # Passing columns keeps the headers present even when rows is empty.
    return pd.DataFrame(rows, columns=columns)
# Gradio UI: five inputs map positionally onto tokenize_text's parameters
# (method, text, chunk_size, chunk_overlap, num_chunks); the returned
# DataFrame is rendered in a table.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=["RecursiveCharacterTextSplitter"],
            # Pre-select the only choice so a submit without touching the
            # dropdown does not send an empty method to the callback.
            value="RecursiveCharacterTextSplitter",
        ),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        # precision=0 makes Gradio round these fields and deliver them as ints,
        # since all three represent whole-number counts.
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE, precision=0),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP, precision=0),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS, precision=0),
    ],
    outputs=gr.Dataframe(
        headers=["Chunk #", "Text Chunk", "Character Count", "Token Count"],
        height=900,
    ),
    title="Text Tokenization Tool",
    description="A tool for tokenizing text using different methods.",
)

# Start the web server when the script is executed.
iface.launch()