# text2jsonl / app.py
# namelessai's picture
# use tempfile
# a3129cb verified
import gradio as gr
import json
import PyPDF2
import markdown
import tempfile
def _read_upload_text(file):
    """Return the contents of an upload decoded as UTF-8 text.

    ``file`` is either a filesystem path (``str``) or an object exposing
    ``read()``. A leading UTF-8 byte-order mark is dropped so that it does
    not end up inside the first record's text.
    """
    if isinstance(file, str):
        with open(file, "rb") as handle:
            raw = handle.read()
    else:
        raw = file.read()
    if isinstance(raw, str):
        # A text-mode handle has already decoded the data for us.
        return raw[1:] if raw.startswith("\ufeff") else raw
    return raw.decode("utf-8-sig")


def _extract_pdf_text(file):
    """Return the text of every page of a PDF, one page per line group."""
    reader = PyPDF2.PdfReader(file)
    # Join pages with a newline so the last line of one page is not glued
    # to the first line of the next; treat a missing extraction result as
    # an empty page instead of failing on ``None``.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def convert_to_jsonl(file):
    """Convert an uploaded .txt, .pdf or .md file into an indexed JSONL file.

    Every non-blank line of the extracted text becomes one JSON object of
    the form ``{"id": <line index>, "text": <stripped line>}``. The index
    is the position of the line in the extracted text *including* blank
    lines, so ids may have gaps. Markdown input is first rendered with
    ``markdown.markdown`` and the rendered output is what gets split into
    lines.

    Args:
        file: The upload, either as a filesystem path (``str``) or as a
            file object exposing ``name`` and ``read()``. ``None`` (no
            upload) is accepted.

    Returns:
        The path of a newly created temporary ``.jsonl`` file, or ``None``
        when ``file`` is ``None``. A file with an unsupported extension
        produces an empty JSONL file.
    """
    if file is None:
        return None

    name = file if isinstance(file, str) else file.name
    # Compare extensions case-insensitively so "REPORT.PDF" is recognised.
    lowered = name.lower()

    content = ""
    if lowered.endswith(".txt"):
        content = _read_upload_text(file)
    elif lowered.endswith(".pdf"):
        content = _extract_pdf_text(file)
    elif lowered.endswith(".md"):
        content = markdown.markdown(_read_upload_text(file))

    records = []
    for index, line in enumerate(content.split("\n")):
        text = line.strip()
        if text:
            records.append(json.dumps({"id": index, "text": text}) + "\n")
    jsonl_output = "".join(records)

    # delete=False keeps the file on disk after closing so that the caller
    # can serve it by path; the name is random with a ".jsonl" suffix.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl") as temp_file:
        temp_file.write(jsonl_output.encode("utf-8"))
    return temp_file.name
# Web UI: a single file upload in, a single downloadable JSONL file out.
iface = gr.Interface(
    title="Raw Text to Indexed JSONL Converter",
    description=(
        "Upload a .txt, .pdf, or .md file to convert it to an indexed "
        "JSONL file for LLM training."
    ),
    fn=convert_to_jsonl,
    inputs=gr.File(label="Upload .txt, .pdf, or .md file"),
    outputs=gr.File(label="Download JSONL file"),
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()