File size: 549 Bytes
c9bf686
 
 
 
8f228b9
c9bf686
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import os
import pickle
import shutil
import subprocess
from glob import glob

from langchain.document_loaders import BSHTMLLoader, DirectoryLoader

# Fetch the Shakespeare HTML corpus. The previous ``!git clone`` line was
# IPython shell syntax, which is a SyntaxError in plain Python; subprocess
# works in both environments. The clone is skipped when ./shakespeare already
# exists so that a re-run does not fail on git's "destination path already
# exists" error.
if not os.path.isdir("./shakespeare"):
    subprocess.run(
        ["git", "clone", "https://github.com/TheMITTech/shakespeare"],
        check=True,  # stop here instead of continuing with an empty corpus
    )

# NOTE: without recursive=True, "**" behaves like "*", so this matches HTML
# files exactly one directory below ./shakespeare.
files = glob("./shakespeare/**/*.html")

# exist_ok avoids FileExistsError when ./data is left over from an earlier run.
destination_folder = './data/'
os.makedirs(destination_folder, exist_ok=True)

# Flatten every matched HTML file into the data folder under its bare file
# name. The previous code indexed the separator string ("/"[-1]) instead of
# the result of split(), which produced `str + list` and raised TypeError.
# NOTE(review): files from different subdirectories that share a file name
# appear to overwrite one another here - confirm that is acceptable.
for html_file in files:
    shutil.move(
        html_file,
        os.path.join(destination_folder, os.path.basename(html_file)),
    )

# Load everything in ./data, using BSHTMLLoader as the per-file loader class.
bshtml_dir_loader = DirectoryLoader('./data/', loader_cls=BSHTMLLoader)

data = bshtml_dir_loader.load()

# Persist the loaded result so later steps can reuse it without re-parsing.
with open("shakespeare.pkl", "wb") as fp:
    pickle.dump(data, fp)