"""Gradio app that converts an uploaded .txt, .pdf, or .md file to indexed JSONL."""

import json
import os
import tempfile

import gradio as gr
import markdown
import PyPDF2


def _upload_path(file):
    """Return the filesystem path of an upload given as a path or a file object."""
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def _read_text(file, path):
    """Return the upload's contents as text, decoding bytes as UTF-8."""
    if hasattr(file, "read"):
        data = file.read()
    else:
        with open(path, "rb") as handle:
            data = handle.read()
    if isinstance(data, bytes):
        # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM,
        # which would otherwise end up inside the first record's text.
        return data.decode("utf-8-sig")
    return data


def _extract_pdf_text(file, path):
    """Return the text of every PDF page, one page per newline-separated chunk."""
    reader = PyPDF2.PdfReader(file if hasattr(file, "read") else path)
    # Join with a newline so the last line of one page is not fused with the
    # first line of the next; tolerate pages that yield no text.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def convert_to_jsonl(file) -> str:
    """Convert an uploaded .txt, .pdf, or .md file into a JSONL file.

    Each non-blank line of the extracted text becomes one JSON object of the
    form ``{"id": <line index>, "text": <stripped line>}``. The ``id`` is the
    0-based index of the line in the extracted text, so blank lines that are
    skipped leave gaps in the numbering. Markdown input is rendered to HTML
    with ``markdown.markdown`` first, and the lines of that HTML are indexed.

    Args:
        file: The upload to convert. May be a path (``str`` or
            ``os.PathLike``) or a file-like object exposing ``name`` and
            ``read()``; which one Gradio supplies depends on its version and
            the ``gr.File`` ``type`` setting.

    Returns:
        Path of a newly created temporary ``.jsonl`` file (not deleted
        automatically).

    Raises:
        ValueError: If no file was uploaded or the extension is unsupported.
        UnicodeDecodeError: If a .txt or .md upload is not valid UTF-8.
    """
    if file is None:
        raise ValueError("No file was uploaded.")

    path = _upload_path(file)
    lowered = path.lower()  # match extensions case-insensitively (.PDF, .TXT)

    if lowered.endswith(".txt"):
        content = _read_text(file, path)
    elif lowered.endswith(".pdf"):
        content = _extract_pdf_text(file, path)
    elif lowered.endswith(".md"):
        content = markdown.markdown(_read_text(file, path))
    else:
        # Fail loudly instead of handing back an empty JSONL file.
        raise ValueError(
            f"Unsupported file type: {os.path.basename(path)!r}. "
            "Expected a .txt, .pdf, or .md file."
        )

    jsonl_output = "".join(
        json.dumps({"id": index, "text": line.strip()}) + "\n"
        for index, line in enumerate(content.split("\n"))
        if line.strip()
    )

    # Persist the result to a uniquely named temporary .jsonl file; it is kept
    # on disk (delete=False) so Gradio can serve it as a download.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl") as temp_file:
        temp_file.write(jsonl_output.encode("utf-8"))
    return temp_file.name


iface = gr.Interface(
    fn=convert_to_jsonl,
    inputs=gr.File(label="Upload .txt, .pdf, or .md file"),
    outputs=gr.File(label="Download JSONL file"),
    title="Raw Text to Indexed JSONL Converter",
    description="Upload a .txt, .pdf, or .md file to convert it to an indexed JSONL file for LLM training.",
)

if __name__ == "__main__":
    iface.launch()