import os
import threading
import time
import subprocess
print("Expanding user path for Ollama")
OLLAMA = os.path.expanduser("~/ollama")
print("Checking if Ollama exists at the path")
if not os.path.exists(OLLAMA):
    print("Ollama not found, downloading it")
    subprocess.run("pwd", shell=True)
    subprocess.run("curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o /home/user/app/ollama-linux-amd64.tgz", shell=True)
    subprocess.run("tar -xzf /home/user/app/ollama-linux-amd64.tgz", shell=True)
    subprocess.run("ls -lash", shell=True)
    time.sleep(10)
def ollama_service_thread():
    print("Starting Ollama service thread")
    subprocess.run("/home/user/app/bin/ollama serve", shell=True)
print("Creating and starting Ollama service thread")
OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
OLLAMA_SERVICE_THREAD.start()
print("Giving Ollama serve a moment to start")
time.sleep(10)
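# Readiness check (sketch): rather than relying only on the fixed sleep, poll the
# Ollama HTTP endpoint until the server responds. Assumes the default port 11434;
# the root path returns "Ollama is running" once the server is up.
import urllib.request
for _ in range(30):
    try:
        urllib.request.urlopen("http://localhost:11434", timeout=1)
        print("Ollama server is responding")
        break
    except OSError:
        time.sleep(1)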
print("Setting model to 'llama3'")
model = "llama3"
print(f"Pulling model {model}")
subprocess.run(f"/home/user/app/bin/ollama pull {model}", shell=True)
subprocess.run(f"/home/user/app/bin/ollama pull mxbai-embed-large", shell=True)
import gradio as gr
import textwrap
# Import LangChain components, aliasing some with shorter names for convenience
from langchain_community.document_loaders import YoutubeLoader as YLoader
from youtube_transcript_api import YouTubeTranscriptApi
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter as RCTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains.summarize import load_summarize_chain
from langchain_ollama import OllamaLLM
from langchain_ollama import OllamaEmbeddings
from langchain.chains import RetrievalQA
import warnings
warnings.filterwarnings('ignore')
lm = OllamaLLM(
    model='llama3',
    base_url='http://localhost:11434',
    keep_alive=-1,  # keep the model loaded indefinitely; Ollama expects a duration or number here
)
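# Quick smoke test (sketch): a short generation confirms the model is loaded
# and the server at base_url is reachable. Uncomment to try:
# print(lm.invoke("Reply with the single word: OK"))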
# Use the dedicated embedding model pulled above
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)
# Load video information from YouTube
yt_loader = YLoader.from_youtube_url("https://www.youtube.com/watch?v=AyOnug-3OKM", add_video_info=False)
# video_data = yt_loader.load()
video_data = [Document(metadata={'source': 'AyOnug-3OKM'}, page_content="Imagine the wildest idea that you've\never had and you're curious about how it might scale to something that's\nhundred a thousand times bigger. This is a place where\nyou can get to do that. OpenAI is building the general\ntechnologies that can extend the reach of the human brain and our creativity. Dall-E really brings high\nquality creations to anyone. Our coding models serve as a coding\nassistant in tools like Co-Pilot. And GPT3 is a system that has a\nrobust understanding of language. Our mission is to create highly capable\nAI technologies and deploy them to the world, for the benefit of humanity in a\nway that is safe and beneficial for all. I think that AI will be a\ntechnological revolution on the scale of the agricultural, the\nindustrial, the computer revolution. And we have no goal other than\nthe creation and deployment of safe, beneficial AGI. We have a non-profit that governs\neverything, and the ability to block deployments that would make commercial\nsense but we think create safety issues. Our whole structure is around that. I really like the way OpenAI\nare trying to build AGI. Namely by harnessing large amounts\nof real world data to train powerful unsupervised models, and then steering\nthose with human alignment to create agents that can actually do useful things. OpenAI is truly unique in\nthat sense of being able to work in a very nimble fashion. At the same time having all the resources\nin the world to solve the problems. That mentality is what made it\nreally possible to ship Dall-E. At OpenAI we felt very strongly that\nengineering is an integral part to the success of any progress in AI. And that research needs to be infused\nwith engineering and engineering needs to be infused with research. In my first month I did a kind of an\noutrageously large scaling project. They were like yeah we have a\npiece of software that runs on one computer can you make it run\non 3000 and you have three weeks? So that was wild. I don't think you need a specific\nmachine learning background to come here to OpenAI and succeed. I had just taken one class in college. People were very willing\nto help and teach. And I worked on Co-Pilot, one of the\nmore successful products that are based on ML that people are using day-to-day. People are really both friendly\nand approachable and ambitious. There's this shared sense of we're\ngoing to do something very very big together, we've set our sights\nhigh, and also we're collaborating together to make that happen. The number of times I've been\nsitting at lunch and you overhear the other team talking about something\nand you learn a trick from that. Just kind of going on the signal of what\nis everyone else excited about is already a really strong sense of what do I need to\npay attention, to what do I need to learn. For the systems that we're developing\nto have a big impact we have to figure out how to make them\naccessible and how to do so safely. So it's very important to work with\na set of people that have diverse experiences, because we're really\nentering an era where we have to innovate in almost every aspect of society. I truly believe that the technology\nwe are creating is going to have a really profound impact. Can we create an environment\nwhere we have AI and technology broadly amplifying human wisdom? Giving humans another resource for\nconnecting with what's actually important to them, what actually gives them meaning. And we need a lot of different\nkinds of people to make that happen.")]
# video_id = 'W7ppd_RY-UE' # Replace with your video ID
# transcript = YouTubeTranscriptApi.get_transcript(video_id)
# Verify video data
if not video_data:
    raise ValueError("Failed to load video data. Please check the YouTube URL and loader.")
# Split text content into documents
text_splitter = RCTextSplitter(chunk_size=1024, chunk_overlap=64)
documents = text_splitter.split_documents(video_data)
# Check if documents are valid
if not documents:
    raise ValueError("No documents could be extracted. Check the text splitter configuration.")
print(f"# of documents = {len(documents)}")
transcript = documents[0].page_content
line_width = 120
print(textwrap.fill(transcript[:2000], line_width))
# Embed text documents using embeddings model
embedding_list = embeddings.embed_documents([doc.page_content for doc in documents])
# Handle empty embedding list
if not embedding_list:
    raise ValueError("Embeddings could not be generated. Please check the embedding model and document content.")
print(f"{len(embedding_list)} embeddings")
print(f"Here's a sample of one: {embedding_list[0][:10]}...")
# Create Chroma vector store from documents and embeddings
vector_store = Chroma.from_documents(documents, embeddings)
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 2})
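# Example (sketch): in recent LangChain versions the retriever can be queried
# directly; `invoke` returns the k most similar Document chunks for a query.
# sample_docs = retriever.invoke("What is OpenAI's mission?")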
summary_chain = load_summarize_chain(llm=lm, chain_type='map_reduce', verbose=False)
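# Usage sketch for the summary chain (assumes the standard map_reduce keys,
# "input_documents" in and "output_text" out):
# summary = summary_chain.invoke({"input_documents": documents})
# print(summary["output_text"])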
# Initialize RetrievalQA chain for question answering
qa_chain = RetrievalQA.from_chain_type(llm=lm, chain_type="stuff", retriever=retriever, return_source_documents=True)
# Define a function for semantic question answering
def semantic_ask(question, print_results=True):
    video_subset = qa_chain.invoke({"query": question})
    context = video_subset
    prompt = f"""
Answer the following question in a detailed manner, using information from the text below. If the answer is not in the text, say I don't know and do not generate your own response.
Question:
{question}
Text:
{context}
Question:
{question}
Answer:
"""
    response = lm.invoke(prompt).strip()
    return {"answer": response}
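# Example usage (sketch):
# print(semantic_ask("What is OpenAI's mission?")["answer"])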
# Define a function to get response for input text
def get_response(input_text):
    response = semantic_ask(input_text)
    return response["answer"]
# Create Gradio interface for the question answering function
gr_interface = gr.Interface(fn=get_response, inputs="text", outputs="text")
gr_interface.launch()
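# Note (sketch): on some container hosts the interface must bind explicitly, e.g.
# gr_interface.launch(server_name="0.0.0.0", server_port=7860)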