# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/media_stores.ipynb.
# %% auto 0
__all__ = ['rawtext_to_doc_split', 'files_to_text', 'youtube_to_text', 'save_text', 'get_youtube_transcript',
           'website_to_text_web', 'website_to_text_unstructured', 'get_document_segments', 'create_local_vector_store']
# %% ../nbs/media_stores.ipynb 3
# import libraries here
import os
import itertools
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.document_loaders import WebBaseLoader, UnstructuredURLLoader
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain
# %% ../nbs/media_stores.ipynb 8
def rawtext_to_doc_split(text, chunk_size=1500, chunk_overlap=150):
    # Accept a single string/Document or a list of them
    if not isinstance(text, list):
        text = [text]

    # Create the splitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap,
                                                   add_start_index=True)

    # Split into document segments
    if isinstance(text[0], Document):
        doc_segments = text_splitter.split_documents(text)
    else:
        doc_segments = text_splitter.split_documents(text_splitter.create_documents(text))

    # Flatten into one list in case the splitter returned nested lists
    doc_segments = list(itertools.chain(*doc_segments)) if isinstance(doc_segments[0], list) else doc_segments

    return doc_segments
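# Example usage (an illustrative sketch, not part of the exported API; the sample
# string below is made up):
#   segments = rawtext_to_doc_split("A long passage of raw text. " * 200,
#                                   chunk_size=500, chunk_overlap=50)
#   print(len(segments), segments[0].page_content[:80])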
# %% ../nbs/media_stores.ipynb 16
## A single file
def _file_to_text(single_file, chunk_size=1000, chunk_overlap=150):
    # Create the loader, then load and split the file into segments
    loader = UnstructuredFileLoader(single_file)
    doc_segments = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                                        chunk_overlap=chunk_overlap,
                                                                        add_start_index=True))
    return doc_segments
## Multiple files
def files_to_text(files_list, chunk_size=1000, chunk_overlap=150):
    # Accept a single file path or a list of them
    if not isinstance(files_list, list):
        files_list = [files_list]

    # Workaround: UnstructuredFileLoader accepts a list of files but does not yet
    # split them correctly per file, so load and split each file individually
    all_segments = [_file_to_text(single_file, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
                    for single_file in files_list]
    all_segments = list(itertools.chain(*all_segments)) if isinstance(all_segments[0], list) else all_segments

    return all_segments
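# Example usage (an illustrative sketch; "syllabus.pdf" and "notes.txt" are
# hypothetical local files, and unstructured's parsers for those file types must
# be installed):
#   segments = files_to_text(["syllabus.pdf", "notes.txt"], chunk_size=1000)
#   print(segments[0].metadata)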
# %% ../nbs/media_stores.ipynb 20
def youtube_to_text(urls, save_dir="content"):
    # Download the videos' audio and transcribe it to text with Whisper
    # save_dir: directory in which to save the intermediate audio files
    if not isinstance(urls, list):
        urls = [urls]

    youtube_loader = GenericLoader(YoutubeAudioLoader(urls, save_dir), OpenAIWhisperParser())
    youtube_docs = youtube_loader.load()

    return youtube_docs
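# Example usage (an illustrative sketch; the URL is a placeholder, and this step
# downloads audio and calls the OpenAI Whisper API, so it needs yt-dlp, ffmpeg,
# and an OpenAI API key):
#   docs = youtube_to_text("https://www.youtube.com/watch?v=<video_id>", save_dir="audio_tmp")
#   print(docs[0].page_content[:200])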
# %% ../nbs/media_stores.ipynb 24
def save_text(text, text_name=None):
    # Default the file name to the first 20 characters of the text
    if not text_name:
        text_name = text[:20]
    text_path = os.path.join("/content", text_name + ".txt")

    # Mode "x" creates the file and raises FileExistsError if it already exists
    with open(text_path, "x") as f:
        f.write(text)

    # Return the location at which the text is saved
    return text_path
# %% ../nbs/media_stores.ipynb 25
def get_youtube_transcript(yt_url, save_transcript=False, temp_audio_dir="sample_data"):
    # Transcribe the video(s) to text; if save_transcript is True, also save the
    # transcript to a file in /content
    # temp_audio_dir: directory in which to save the intermediate audio files
    youtube_docs = youtube_to_text(yt_url, save_dir=temp_audio_dir)

    # Combine the transcribed documents into one text
    combined_docs = [doc.page_content for doc in youtube_docs]
    combined_text = " ".join(combined_docs)

    # Optionally save the transcript to a file named after the video
    video_path = youtube_docs[0].metadata["source"]
    youtube_name = os.path.splitext(os.path.basename(video_path))[0]
    save_path = None
    if save_transcript:
        save_path = save_text(combined_text, youtube_name)

    return youtube_docs, save_path
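# Example usage (an illustrative sketch; placeholder URL, same yt-dlp/ffmpeg/API-key
# requirements as youtube_to_text, and saving assumes a writable /content, as on Colab):
#   docs, transcript_path = get_youtube_transcript("https://www.youtube.com/watch?v=<video_id>",
#                                                  save_transcript=True)
#   print(transcript_path)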
# %% ../nbs/media_stores.ipynb 27
def website_to_text_web(url, chunk_size=1500, chunk_overlap=100):
    # url can be a single string or a list of URLs
    website_loader = WebBaseLoader(url)
    website_raw = website_loader.load()

    # Split the raw page contents into document segments
    website_data = rawtext_to_doc_split(website_raw, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return website_data
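# Example usage (an illustrative sketch; the URL is a placeholder):
#   pages = website_to_text_web("https://example.com/article", chunk_size=1000)
#   print(pages[0].metadata["source"])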
# %% ../nbs/media_stores.ipynb 33
def website_to_text_unstructured(web_urls, chunk_size=1500, chunk_overlap=100):
    # Accept a single URL or a list of URLs
    if not isinstance(web_urls, list):
        web_urls = [web_urls]

    # Load the pages and split them into document segments
    website_loader = UnstructuredURLLoader(web_urls)
    website_raw = website_loader.load()
    website_data = rawtext_to_doc_split(website_raw, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return website_data
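# Example usage (an illustrative sketch; placeholder URLs, and the unstructured
# package must be installed):
#   pages = website_to_text_unstructured(["https://example.com/a", "https://example.com/b"])
#   print(len(pages))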
# %% ../nbs/media_stores.ipynb 45
def get_document_segments(context_info, data_type, chunk_size=1500, chunk_overlap=100):
    # Pick the loading function based on the type of source material
    if data_type == 'text':
        load_fcn = rawtext_to_doc_split
    elif data_type == 'web_page':
        load_fcn = website_to_text_unstructured
    elif data_type == 'youtube_video':
        load_fcn = youtube_to_text
    else:
        load_fcn = files_to_text

    # Get the document segments; youtube_to_text does not accept chunking
    # parameters, so only pass them to the other loaders
    if load_fcn is youtube_to_text:
        doc_segments = load_fcn(context_info)
    else:
        doc_segments = load_fcn(context_info, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return doc_segments
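# Example usage (an illustrative sketch; routes a made-up raw string through
# rawtext_to_doc_split):
#   segments = get_document_segments("Some course text to index...", data_type='text',
#                                    chunk_size=1000, chunk_overlap=100)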
# %% ../nbs/media_stores.ipynb 47
def create_local_vector_store(document_segments, **retriever_kwargs):
    # Embed the segments with OpenAI embeddings, index them in a local Chroma
    # store, and expose the store as a retriever
    embeddings = OpenAIEmbeddings()
    db = Chroma.from_documents(document_segments, embeddings)
    retriever = db.as_retriever(**retriever_kwargs)

    return db, retriever
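# Example usage (an illustrative sketch; requires OPENAI_API_KEY to be set and
# reuses the hypothetical segments from the get_document_segments example above):
#   db, retriever = create_local_vector_store(segments, search_kwargs={"k": 4})
#   matches = retriever.get_relevant_documents("What topics are covered?")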