# Spaces: Sleeping
import json
import os
import tempfile

import gradio as gr
import markdown
import PyPDF2
def _read_upload_text(path, stream):
    """Return the text of an upload given either as a path or as a stream."""
    if stream is not None:
        raw = stream.read()
    else:
        with open(path, "rb") as handle:
            raw = handle.read()
    # A stream opened in text mode already yields ``str``; only bytes need
    # decoding.
    return raw.decode("utf-8") if isinstance(raw, bytes) else raw


def convert_to_jsonl(file):
    """Convert an uploaded .txt, .pdf, or .md file to an indexed JSONL file.

    Every non-blank line of the extracted text becomes one JSON object of the
    form ``{"id": <line index>, "text": <stripped line>}``.  The index is the
    line's position in the full text (blank lines are counted but not
    emitted), so ids may contain gaps.

    Args:
        file: The upload.  Either a filesystem path (``str`` or
            ``os.PathLike``) or an object with a ``name`` attribute holding
            the filename and, optionally, a ``read()`` method.  The file type
            is chosen from the filename extension, compared
            case-insensitively.

    Returns:
        The path of a newly created temporary ``.jsonl`` file.  The file is
        created with ``delete=False``, so it is not removed automatically.
        A file with an unsupported extension produces an empty output file.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
        UnicodeDecodeError: If a .txt or .md upload is not valid UTF-8.
    """
    if file is None:
        raise ValueError("No file provided")

    # Uploads arrive either as a plain path or as a file-like wrapper whose
    # ``name`` is the path; support both so the function does not depend on
    # how the caller hands the file over.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
        stream = None
    else:
        path = file.name
        stream = file if hasattr(file, "read") else None

    lowered = path.lower()
    content = ""
    if lowered.endswith(".txt"):
        content = _read_upload_text(path, stream)
    elif lowered.endswith(".pdf"):
        pdf_reader = PyPDF2.PdfReader(stream if stream is not None else path)
        # Join pages with a newline so the last line of one page is not fused
        # with the first line of the next; tolerate pages with no text.
        content = "\n".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )
    elif lowered.endswith(".md"):
        # NOTE: the Markdown is passed through markdown.markdown(), so the
        # emitted "text" values are lines of its rendered output.
        content = markdown.markdown(_read_upload_text(path, stream))

    records = []
    for index, line in enumerate(content.split("\n")):
        text = line.strip()
        if text:
            records.append(json.dumps({"id": index, "text": text}) + "\n")
    jsonl_output = "".join(records)

    # Write the JSONL to a uniquely named temporary file with a .jsonl suffix;
    # delete=False keeps it on disk so the caller can serve it for download.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl") as temp_file:
        temp_file.write(jsonl_output.encode("utf-8"))
    return temp_file.name
# Gradio UI: a single file upload in, a single downloadable JSONL file out.
iface = gr.Interface(
    title="Raw Text to Indexed JSONL Converter",
    description=(
        "Upload a .txt, .pdf, or .md file to convert it to an indexed JSONL "
        "file for LLM training."
    ),
    fn=convert_to_jsonl,
    inputs=gr.File(label="Upload .txt, .pdf, or .md file"),
    outputs=gr.File(label="Download JSONL file"),
)

# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()