{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data preparation: train/test split of the poem corpora\n",
    "\n",
    "Builds `train.txt` and `test.txt` from the per-poet `.txt` files in this directory. For every poet file, the leading `TRAIN_FRACTION` of its lines is written to the training file and the remaining lines are written to the test file.\n",
    "\n",
    "The split cell overwrites both output files, so the notebook can be re-run from a fresh kernel (Restart Kernel and Run All) without duplicating data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "TRAIN_FILE = 'train.txt'\n",
    "TEST_FILE = 'test.txt'\n",
    "TRAIN_FRACTION = 0.9  # share of each poet's lines that goes to the training file\n",
    "# Assumes the corpora are stored as UTF-8 (TODO confirm). An explicit encoding keeps\n",
    "# reads and writes independent of the platform's locale default.\n",
    "ENCODING = 'utf-8'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select the poet corpora"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(14,\n",
       " ['sukanta.txt',\n",
       " 'jatindramohan.txt',\n",
       " 'sukumar.txt',\n",
       " 'ishwarchandragupta.txt',\n",
       " 'test.txt',\n",
       " 'data_prep.ipynb',\n",
       " 'train.txt',\n",
       " 'madhusudan.txt',\n",
       " 'satyendranath.txt',\n",
       " 'rabindranath.txt',\n",
       " 'jibanananda.txt',\n",
       " '.ipynb_checkpoints',\n",
       " 'nazrulislam.txt',\n",
       " 'kaminiroy.txt'])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files = os.listdir('.')\n",
    "len(files), files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick the corpora by name, not by position in `files`: os.listdir() returns\n",
    "# entries in arbitrary order, so index slices are not reproducible.\n",
    "# Sorting fixes the order in which poets are written to the output files.\n",
    "GENERATED_FILES = {TRAIN_FILE, TEST_FILE}\n",
    "poems_files = sorted(\n",
    "    name for name in files\n",
    "    if name.endswith('.txt') and name not in GENERATED_FILES\n",
    ")\n",
    "poems_files, len(poems_files)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_files(dr):\n",
    "    \"\"\"Lazily open every file named in `poems_files` that exists in directory `dr`.\n",
    "\n",
    "    Returns a generator of read-mode text handles, in `poems_files` order.\n",
    "    A handle is only opened when the generator reaches it; the caller is\n",
    "    responsible for closing it (for example with `with handle:`).\n",
    "    \"\"\"\n",
    "    present = set(os.listdir(dr))\n",
    "    return (\n",
    "        # Join with `dr` so the lookup also works when `dr` is not the working directory.\n",
    "        open(os.path.join(dr, file), 'r', encoding=ENCODING)\n",
    "        for file in poems_files\n",
    "        if file in present\n",
    "    )\n",
    "\n",
    "\n",
    "def split_lines(lines, train_fraction=TRAIN_FRACTION):\n",
    "    \"\"\"Split `lines` into a `(train, test)` pair of lists.\n",
    "\n",
    "    The first `int(len(lines) * train_fraction)` lines form the training part\n",
    "    and the rest form the test part, so every line lands in exactly one part.\n",
    "    \"\"\"\n",
    "    split_idx = int(len(lines) * train_fraction)\n",
    "    return lines[:split_idx], lines[split_idx:]\n",
    "\n",
    "\n",
    "def count_lines(path):\n",
    "    \"\"\"Return the number of lines in the text file at `path`.\"\"\"\n",
    "    with open(path, 'r', encoding=ENCODING) as handle:\n",
    "        return sum(1 for _ in handle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mode 'w' (not 'a'): re-running this cell rebuilds both files instead of\n",
    "# appending a second copy of every line. Each output file is opened once.\n",
    "with open(TRAIN_FILE, 'w', encoding=ENCODING) as train_:\n",
    "    with open(TEST_FILE, 'w', encoding=ENCODING) as test_:\n",
    "        for poet in open_files('.'):\n",
    "            with poet:  # close each corpus as soon as it has been read\n",
    "                lines = poet.readlines()\n",
    "            # Terminate the final line so it cannot fuse with the next poet's first line.\n",
    "            if lines and not lines[-1].endswith('\\n'):\n",
    "                lines[-1] += '\\n'\n",
    "            train_lines, test_lines = split_lines(lines)\n",
    "            train_.writelines(train_lines)\n",
    "            test_.writelines(test_lines)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Verify the output files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_line_count = count_lines(TRAIN_FILE)\n",
    "test_line_count = count_lines(TEST_FILE)\n",
    "print(f'Number of lines in train file: {train_line_count}')\n",
    "print(f'Number of lines in test file: {test_line_count}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}