# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/media_stores.ipynb.
# %% auto 0
__all__ = ['rawtext_to_doc_split', 'files_to_text', 'youtube_to_text', 'save_text', 'get_youtube_transcript',
           'website_to_text_web', 'website_to_text_unstructured', 'get_document_segments', 'create_local_vector_store']
# %% ../nbs/media_stores.ipynb 3
# import libraries here
import os
import itertools
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.document_loaders import WebBaseLoader, UnstructuredURLLoader
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain
# %% ../nbs/media_stores.ipynb 8
def rawtext_to_doc_split(text, chunk_size=1500, chunk_overlap=150):
    # Accept a single string/Document or a list of them
    if not isinstance(text, list):
        text = [text]

    # Create the splitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap,
                                                   add_start_index=True)

    # Split into document segments
    if isinstance(text[0], Document):
        doc_segments = text_splitter.split_documents(text)
    else:
        doc_segments = text_splitter.split_documents(text_splitter.create_documents(text))

    # Flatten into one list in case the splitter returned nested lists
    doc_segments = list(itertools.chain(*doc_segments)) if isinstance(doc_segments[0], list) else doc_segments

    return doc_segments
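# Example usage (an illustrative sketch, not part of the exported API; the sample
# string below is made up):
#   segments = rawtext_to_doc_split("A long passage of raw text. " * 200,
#                                   chunk_size=500, chunk_overlap=50)
#   print(len(segments), segments[0].page_content[:80])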
# %% ../nbs/media_stores.ipynb 16
## A single file
def _file_to_text(single_file, chunk_size=1000, chunk_overlap=150):
    # Create the loader, then load and split the file into segments
    loader = UnstructuredFileLoader(single_file)
    doc_segments = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                                        chunk_overlap=chunk_overlap,
                                                                        add_start_index=True))
    return doc_segments
## Multiple files
def files_to_text(files_list, chunk_size=1000, chunk_overlap=150):
    # Accept a single file path or a list of them
    if not isinstance(files_list, list):
        files_list = [files_list]

    # Workaround: UnstructuredFileLoader accepts a list of files but does not yet
    # split them correctly per file, so load and split each file individually
    all_segments = [_file_to_text(single_file, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
                    for single_file in files_list]
    all_segments = list(itertools.chain(*all_segments)) if isinstance(all_segments[0], list) else all_segments

    return all_segments
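# Example usage (an illustrative sketch; "syllabus.pdf" and "notes.txt" are
# hypothetical local files, and unstructured's parsers for those file types must
# be installed):
#   segments = files_to_text(["syllabus.pdf", "notes.txt"], chunk_size=1000)
#   print(segments[0].metadata)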
# %% ../nbs/media_stores.ipynb 20
def youtube_to_text(urls, save_dir="content"):
    # Download the videos' audio and transcribe it to text with Whisper
    # save_dir: directory in which to save the intermediate audio files
    if not isinstance(urls, list):
        urls = [urls]

    youtube_loader = GenericLoader(YoutubeAudioLoader(urls, save_dir), OpenAIWhisperParser())
    youtube_docs = youtube_loader.load()

    return youtube_docs
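# Example usage (an illustrative sketch; the URL is a placeholder, and this step
# downloads audio and calls the OpenAI Whisper API, so it needs yt-dlp, ffmpeg,
# and an OpenAI API key):
#   docs = youtube_to_text("https://www.youtube.com/watch?v=<video_id>", save_dir="audio_tmp")
#   print(docs[0].page_content[:200])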
# %% ../nbs/media_stores.ipynb 24
def save_text(text, text_name=None):
    # Default the file name to the first 20 characters of the text
    if not text_name:
        text_name = text[:20]
    text_path = os.path.join("/content", text_name + ".txt")

    # Mode "x" creates the file and raises FileExistsError if it already exists
    with open(text_path, "x") as f:
        f.write(text)

    # Return the location at which the text is saved
    return text_path
# %% ../nbs/media_stores.ipynb 25
def get_youtube_transcript(yt_url, save_transcript=False, temp_audio_dir="sample_data"):
    # Transcribe the video(s) to text; if save_transcript is True, also save the
    # transcript to a file in /content
    # temp_audio_dir: directory in which to save the intermediate audio files
    youtube_docs = youtube_to_text(yt_url, save_dir=temp_audio_dir)

    # Combine the transcribed documents into one text
    combined_docs = [doc.page_content for doc in youtube_docs]
    combined_text = " ".join(combined_docs)

    # Optionally save the transcript to a file named after the video
    video_path = youtube_docs[0].metadata["source"]
    youtube_name = os.path.splitext(os.path.basename(video_path))[0]
    save_path = None
    if save_transcript:
        save_path = save_text(combined_text, youtube_name)

    return youtube_docs, save_path
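# Example usage (an illustrative sketch; placeholder URL, same yt-dlp/ffmpeg/API-key
# requirements as youtube_to_text, and saving assumes a writable /content, as on Colab):
#   docs, transcript_path = get_youtube_transcript("https://www.youtube.com/watch?v=<video_id>",
#                                                  save_transcript=True)
#   print(transcript_path)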
# %% ../nbs/media_stores.ipynb 27
def website_to_text_web(url, chunk_size=1500, chunk_overlap=100):
    # url can be a single string or a list of URLs
    website_loader = WebBaseLoader(url)
    website_raw = website_loader.load()

    # Split the raw page contents into document segments
    website_data = rawtext_to_doc_split(website_raw, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return website_data
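# Example usage (an illustrative sketch; the URL is a placeholder):
#   pages = website_to_text_web("https://example.com/article", chunk_size=1000)
#   print(pages[0].metadata["source"])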
# %% ../nbs/media_stores.ipynb 33
def website_to_text_unstructured(web_urls, chunk_size=1500, chunk_overlap=100):
    # Accept a single URL or a list of URLs
    if not isinstance(web_urls, list):
        web_urls = [web_urls]

    # Load the pages and split them into document segments
    website_loader = UnstructuredURLLoader(web_urls)
    website_raw = website_loader.load()
    website_data = rawtext_to_doc_split(website_raw, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return website_data
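# Example usage (an illustrative sketch; placeholder URLs, and the unstructured
# package must be installed):
#   pages = website_to_text_unstructured(["https://example.com/a", "https://example.com/b"])
#   print(len(pages))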
# %% ../nbs/media_stores.ipynb 45
def get_document_segments(context_info, data_type, chunk_size=1500, chunk_overlap=100):
    # Pick the loading function based on the type of source material
    if data_type == 'text':
        load_fcn = rawtext_to_doc_split
    elif data_type == 'web_page':
        load_fcn = website_to_text_unstructured
    elif data_type == 'youtube_video':
        load_fcn = youtube_to_text
    else:
        load_fcn = files_to_text

    # Get the document segments; youtube_to_text does not accept chunking
    # parameters, so only pass them to the other loaders
    if load_fcn is youtube_to_text:
        doc_segments = load_fcn(context_info)
    else:
        doc_segments = load_fcn(context_info, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return doc_segments
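# Example usage (an illustrative sketch; routes a made-up raw string through
# rawtext_to_doc_split):
#   segments = get_document_segments("Some course text to index...", data_type='text',
#                                    chunk_size=1000, chunk_overlap=100)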
# %% ../nbs/media_stores.ipynb 47
def create_local_vector_store(document_segments, **retriever_kwargs):
    # Embed the segments with OpenAI embeddings, index them in a local Chroma
    # store, and expose the store as a retriever
    embeddings = OpenAIEmbeddings()
    db = Chroma.from_documents(document_segments, embeddings)
    retriever = db.as_retriever(**retriever_kwargs)

    return db, retriever
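# Example usage (an illustrative sketch; requires OPENAI_API_KEY to be set and
# reuses the hypothetical segments from the get_document_segments example above):
#   db, retriever = create_local_vector_store(segments, search_kwargs={"k": 4})
#   matches = retriever.get_relevant_documents("What topics are covered?")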