eklyman commited on
Commit
b8371dd
1 Parent(s): 5284eff

first commit

Browse files
.chainlit/config.toml ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ # Whether to enable telemetry (default: true). No personal data is collected.
3
+ enable_telemetry = true
4
+
5
+
6
+ # List of environment variables to be provided by each user to use the app.
7
+ user_env = []
8
+
9
+ # Duration (in seconds) during which the session is saved when the connection is lost
10
+ session_timeout = 3600
11
+
12
+ # Enable third-party caching (e.g. the LangChain cache)
13
+ cache = false
14
+
15
+ # Authorized origins
16
+ allow_origins = ["*"]
17
+
18
+ # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
19
+ # follow_symlink = false
20
+
21
+ [features]
22
+ # Show the prompt playground
23
+ prompt_playground = true
24
+
25
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
26
+ unsafe_allow_html = false
27
+
28
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
29
+ latex = false
30
+
31
+ # Authorize users to upload files with messages
32
+ multi_modal = true
33
+
34
+ # Allows user to use speech to text
35
+ [features.speech_to_text]
36
+ enabled = false
37
+ # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
38
+ # language = "en-US"
39
+
40
+ [UI]
41
+ # Name of the app and chatbot.
42
+ name = "Chatbot"
43
+
44
+ # Show the readme while the thread is empty.
45
+ show_readme_as_default = true
46
+
47
+ # Description of the app and chatbot. This is used for HTML tags.
48
+ # description = ""
49
+
50
+ # Large content is collapsed by default for a cleaner UI
51
+ default_collapse_content = true
52
+
53
+ # The default value for the expand messages settings.
54
+ default_expand_messages = false
55
+
56
+ # Hide the chain of thought details from the user in the UI.
57
+ hide_cot = false
58
+
59
+ # Link to your github repo. This will add a github button in the UI's header.
60
+ # github = ""
61
+
62
+ # Specify a CSS file that can be used to customize the user interface.
63
+ # The CSS file can be served from the public directory or via an external link.
64
+ # custom_css = "/public/test.css"
65
+
66
+ # Specify a Javascript file that can be used to customize the user interface.
67
+ # The Javascript file can be served from the public directory.
68
+ # custom_js = "/public/test.js"
69
+
70
+ # Specify a custom font url.
71
+ # custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap"
72
+
73
+ # Override default MUI light theme. (Check theme.ts)
74
+ [UI.theme]
75
+ #font_family = "Inter, sans-serif"
76
+ [UI.theme.light]
77
+ #background = "#FAFAFA"
78
+ #paper = "#FFFFFF"
79
+
80
+ [UI.theme.light.primary]
81
+ #main = "#F80061"
82
+ #dark = "#980039"
83
+ #light = "#FFE7EB"
84
+
85
+ # Override default MUI dark theme. (Check theme.ts)
86
+ [UI.theme.dark]
87
+ #background = "#FAFAFA"
88
+ #paper = "#FFFFFF"
89
+
90
+ [UI.theme.dark.primary]
91
+ #main = "#F80061"
92
+ #dark = "#980039"
93
+ #light = "#FFE7EB"
94
+
95
+
96
+ [meta]
97
+ generated_by = "1.0.401"
.chainlit/translations/en-US.json ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "components": {
3
+ "atoms": {
4
+ "buttons": {
5
+ "userButton": {
6
+ "menu": {
7
+ "settings": "Settings",
8
+ "settingsKey": "S",
9
+ "APIKeys": "API Keys",
10
+ "logout": "Logout"
11
+ }
12
+ }
13
+ }
14
+ },
15
+ "molecules": {
16
+ "newChatButton": {
17
+ "newChat": "New Chat"
18
+ },
19
+ "tasklist": {
20
+ "TaskList": {
21
+ "title": "\ud83d\uddd2\ufe0f Task List",
22
+ "loading": "Loading...",
23
+ "error": "An error occurred"
24
+ }
25
+ },
26
+ "attachments": {
27
+ "cancelUpload": "Cancel upload",
28
+ "removeAttachment": "Remove attachment"
29
+ },
30
+ "newChatDialog": {
31
+ "createNewChat": "Create new chat?",
32
+ "clearChat": "This will clear the current messages and start a new chat.",
33
+ "cancel": "Cancel",
34
+ "confirm": "Confirm"
35
+ },
36
+ "settingsModal": {
37
+ "expandMessages": "Expand Messages",
38
+ "hideChainOfThought": "Hide Chain of Thought",
39
+ "darkMode": "Dark Mode"
40
+ }
41
+ },
42
+ "organisms": {
43
+ "chat": {
44
+ "history": {
45
+ "index": {
46
+ "lastInputs": "Last Inputs",
47
+ "noInputs": "Such empty...",
48
+ "loading": "Loading..."
49
+ }
50
+ },
51
+ "inputBox": {
52
+ "input": {
53
+ "placeholder": "Type your message here..."
54
+ },
55
+ "speechButton": {
56
+ "start": "Start recording",
57
+ "stop": "Stop recording"
58
+ },
59
+ "SubmitButton": {
60
+ "sendMessage": "Send message",
61
+ "stopTask": "Stop Task"
62
+ },
63
+ "UploadButton": {
64
+ "attachFiles": "Attach files"
65
+ },
66
+ "waterMark": {
67
+ "text": "Built with"
68
+ }
69
+ },
70
+ "Messages": {
71
+ "index": {
72
+ "running": "Running",
73
+ "executedSuccessfully": "executed successfully",
74
+ "failed": "failed",
75
+ "feedbackUpdated": "Feedback updated",
76
+ "updating": "Updating"
77
+ }
78
+ },
79
+ "dropScreen": {
80
+ "dropYourFilesHere": "Drop your files here"
81
+ },
82
+ "index": {
83
+ "failedToUpload": "Failed to upload",
84
+ "cancelledUploadOf": "Cancelled upload of",
85
+ "couldNotReachServer": "Could not reach the server",
86
+ "continuingChat": "Continuing previous chat"
87
+ },
88
+ "settings": {
89
+ "settingsPanel": "Settings panel",
90
+ "reset": "Reset",
91
+ "cancel": "Cancel",
92
+ "confirm": "Confirm"
93
+ }
94
+ },
95
+ "threadHistory": {
96
+ "sidebar": {
97
+ "filters": {
98
+ "FeedbackSelect": {
99
+ "feedbackAll": "Feedback: All",
100
+ "feedbackPositive": "Feedback: Positive",
101
+ "feedbackNegative": "Feedback: Negative"
102
+ },
103
+ "SearchBar": {
104
+ "search": "Search"
105
+ }
106
+ },
107
+ "DeleteThreadButton": {
108
+ "confirmMessage": "This will delete the thread as well as its messages and elements.",
109
+ "cancel": "Cancel",
110
+ "confirm": "Confirm",
111
+ "deletingChat": "Deleting chat",
112
+ "chatDeleted": "Chat deleted"
113
+ },
114
+ "index": {
115
+ "pastChats": "Past Chats"
116
+ },
117
+ "ThreadList": {
118
+ "empty": "Empty..."
119
+ },
120
+ "TriggerButton": {
121
+ "closeSidebar": "Close sidebar",
122
+ "openSidebar": "Open sidebar"
123
+ }
124
+ },
125
+ "Thread": {
126
+ "backToChat": "Go back to chat",
127
+ "chatCreatedOn": "This chat was created on"
128
+ }
129
+ },
130
+ "header": {
131
+ "chat": "Chat",
132
+ "readme": "Readme"
133
+ }
134
+ }
135
+ },
136
+ "hooks": {
137
+ "useLLMProviders": {
138
+ "failedToFetchProviders": "Failed to fetch providers:"
139
+ }
140
+ },
141
+ "pages": {
142
+ "Design": {},
143
+ "Env": {
144
+ "savedSuccessfully": "Saved successfully",
145
+ "requiredApiKeys": "Required API Keys",
146
+ "requiredApiKeysInfo": "To use this app, the following API keys are required. The keys are stored on your device's local storage."
147
+ },
148
+ "Page": {
149
+ "notPartOfProject": "You are not part of this project."
150
+ },
151
+ "ResumeButton": {
152
+ "resumeChat": "Resume Chat"
153
+ }
154
+ }
155
+ }
.chainlit/translations/pt-BR.json ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "components": {
3
+ "atoms": {
4
+ "buttons": {
5
+ "userButton": {
6
+ "menu": {
7
+ "settings": "Configura\u00e7\u00f5es",
8
+ "settingsKey": "S",
9
+ "APIKeys": "Chaves de API",
10
+ "logout": "Sair"
11
+ }
12
+ }
13
+ }
14
+ },
15
+ "molecules": {
16
+ "newChatButton": {
17
+ "newChat": "Nova Conversa"
18
+ },
19
+ "tasklist": {
20
+ "TaskList": {
21
+ "title": "\ud83d\uddd2\ufe0f Lista de Tarefas",
22
+ "loading": "Carregando...",
23
+ "error": "Ocorreu um erro"
24
+ }
25
+ },
26
+ "attachments": {
27
+ "cancelUpload": "Cancelar envio",
28
+ "removeAttachment": "Remover anexo"
29
+ },
30
+ "newChatDialog": {
31
+ "createNewChat": "Criar novo chat?",
32
+ "clearChat": "Isso limpar\u00e1 as mensagens atuais e iniciar\u00e1 uma nova conversa.",
33
+ "cancel": "Cancelar",
34
+ "confirm": "Confirmar"
35
+ },
36
+ "settingsModal": {
37
+ "expandMessages": "Expandir Mensagens",
38
+ "hideChainOfThought": "Esconder Sequ\u00eancia de Pensamento",
39
+ "darkMode": "Modo Escuro"
40
+ }
41
+ },
42
+ "organisms": {
43
+ "chat": {
44
+ "history": {
45
+ "index": {
46
+ "lastInputs": "\u00daltimas Entradas",
47
+ "noInputs": "Vazio...",
48
+ "loading": "Carregando..."
49
+ }
50
+ },
51
+ "inputBox": {
52
+ "input": {
53
+ "placeholder": "Digite sua mensagem aqui..."
54
+ },
55
+ "speechButton": {
56
+ "start": "Iniciar grava\u00e7\u00e3o",
57
+ "stop": "Parar grava\u00e7\u00e3o"
58
+ },
59
+ "SubmitButton": {
60
+ "sendMessage": "Enviar mensagem",
61
+ "stopTask": "Parar Tarefa"
62
+ },
63
+ "UploadButton": {
64
+ "attachFiles": "Anexar arquivos"
65
+ },
66
+ "waterMark": {
67
+ "text": "Constru\u00eddo com"
68
+ }
69
+ },
70
+ "Messages": {
71
+ "index": {
72
+ "running": "Executando",
73
+ "executedSuccessfully": "executado com sucesso",
74
+ "failed": "falhou",
75
+ "feedbackUpdated": "Feedback atualizado",
76
+ "updating": "Atualizando"
77
+ }
78
+ },
79
+ "dropScreen": {
80
+ "dropYourFilesHere": "Solte seus arquivos aqui"
81
+ },
82
+ "index": {
83
+ "failedToUpload": "Falha ao enviar",
84
+ "cancelledUploadOf": "Envio cancelado de",
85
+ "couldNotReachServer": "N\u00e3o foi poss\u00edvel conectar ao servidor",
86
+ "continuingChat": "Continuando o chat anterior"
87
+ },
88
+ "settings": {
89
+ "settingsPanel": "Painel de Configura\u00e7\u00f5es",
90
+ "reset": "Redefinir",
91
+ "cancel": "Cancelar",
92
+ "confirm": "Confirmar"
93
+ }
94
+ },
95
+ "threadHistory": {
96
+ "sidebar": {
97
+ "filters": {
98
+ "FeedbackSelect": {
99
+ "feedbackAll": "Feedback: Todos",
100
+ "feedbackPositive": "Feedback: Positivo",
101
+ "feedbackNegative": "Feedback: Negativo"
102
+ },
103
+ "SearchBar": {
104
+ "search": "Buscar"
105
+ }
106
+ },
107
+ "DeleteThreadButton": {
108
+ "confirmMessage": "Isso deletar\u00e1 a conversa, assim como suas mensagens e elementos.",
109
+ "cancel": "Cancelar",
110
+ "confirm": "Confirmar",
111
+ "deletingChat": "Deletando conversa",
112
+ "chatDeleted": "Conversa deletada"
113
+ },
114
+ "index": {
115
+ "pastChats": "Conversas Anteriores"
116
+ },
117
+ "ThreadList": {
118
+ "empty": "Vazio..."
119
+ },
120
+ "TriggerButton": {
121
+ "closeSidebar": "Fechar barra lateral",
122
+ "openSidebar": "Abrir barra lateral"
123
+ }
124
+ },
125
+ "Thread": {
126
+ "backToChat": "Voltar para a conversa",
127
+ "chatCreatedOn": "Esta conversa foi criada em"
128
+ }
129
+ },
130
+ "header": {
131
+ "chat": "Conversa",
132
+ "readme": "Leia-me"
133
+ }
134
+ },
135
+ "hooks": {
136
+ "useLLMProviders": {
137
+ "failedToFetchProviders": "Falha ao buscar provedores:"
138
+ }
139
+ },
140
+ "pages": {
141
+ "Design": {},
142
+ "Env": {
143
+ "savedSuccessfully": "Salvo com sucesso",
144
+ "requiredApiKeys": "Chaves de API necess\u00e1rias",
145
+ "requiredApiKeysInfo": "Para usar este aplicativo, as seguintes chaves de API s\u00e3o necess\u00e1rias. As chaves s\u00e3o armazenadas localmente em seu dispositivo."
146
+ },
147
+ "Page": {
148
+ "notPartOfProject": "Voc\u00ea n\u00e3o faz parte deste projeto."
149
+ },
150
+ "ResumeButton": {
151
+ "resumeChat": "Continuar Conversa"
152
+ }
153
+ }
154
+ }
155
+ }
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .env
2
+ __pycache__/
3
+ venv/
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9

# Run as an unprivileged user with a fixed UID.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Install dependencies before copying the source so this layer stays cached
# until requirements.txt changes. ("~" is not expanded by COPY, so the
# destination is spelled out with $HOME.)
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application once, owned by the runtime user.
COPY --chown=user . $HOME/app

CMD ["chainlit", "run", "app.py", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Pt Assistant Demo
3
  emoji: 📚
4
  colorFrom: red
5
  colorTo: blue
6
  sdk: docker
7
- pinned: false
8
  license: openrail
9
  ---
10
 
 
1
  ---
2
+ title: PT Assistant Demo
3
  emoji: 📚
4
  colorFrom: red
5
  colorTo: blue
6
  sdk: docker
7
+ pinned: true
8
  license: openrail
9
  ---
10
 
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+
5
+ import chainlit as cl
6
+ from dotenv import load_dotenv
7
+ from langchain.embeddings import CacheBackedEmbeddings
8
+ from langchain.prompts import ChatPromptTemplate
9
+ from langchain.schema import StrOutputParser
10
+ from langchain.schema.runnable import RunnablePassthrough
11
+ from langchain.schema.runnable.config import RunnableConfig
12
+ from langchain.storage import LocalFileStore
13
+ from langchain_core.runnables import RunnableParallel
14
+ from langchain_openai import ChatOpenAI
15
+ from langchain_openai import OpenAIEmbeddings
16
+ from langchain_pinecone import PineconeVectorStore
17
+ import requests
18
+
19
+ load_dotenv()
20
+
21
# Prompt template for the RAG chain. `{context}` and `{question}` are the two
# placeholders filled in when the template is rendered.
RAG_PROMPT = """
CONTEXT:
{context}
QUERY:
{question}

You are a helpful assistant and you provide summarized
and succinct information. Your answers are accurate yet
brief, to ensure the reader can obtain the high level
responses. You will be presented with a question helping
one of our customers, and your job is to be as detailed
and as helpful as possible. If the context provided
doesn't answer the question, please respond with: "I
require more information in order to better assist you,
please state your question and what kind of service or
support you are seeking."
"""
38
+
39
# Embedding model used for queries (library defaults).
core_embeddings_model = OpenAIEmbeddings()

# Pinecone-backed vector store; the index name comes from the INDEX_NAME
# environment variable.
vector_store = PineconeVectorStore(
    index_name=os.getenv("INDEX_NAME"),
    embedding=core_embeddings_model,
)

# Ask the store for one document per query (k=1).
retriever = vector_store.as_retriever(search_kwargs={"k": 1})

# Chat model with streaming enabled; the model name comes from the
# OPENAI_MODEL environment variable.
llm = ChatOpenAI(model=os.getenv("OPENAI_MODEL"), streaming=True)
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
46
+
47
+
48
def format_docs(docs):
    """Join the text of retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents. Items exposing a
            ``page_content`` attribute contribute that text; plain strings
            are used as-is.

    Returns:
        The document texts separated by blank lines ("" when *docs* is empty).
    """
    return "\n\n".join(getattr(doc, "page_content", doc) for doc in docs)


# Answer-generating sub-chain. Its input is the mapping produced by
# `rag_chain_with_source` ({"context": [...], "question": ...}), so only the
# documents under "context" are formatted -- passing the whole mapping would
# join its keys instead of the document text.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda x: format_docs(x["context"]))
    | rag_prompt
    | llm
    | StrOutputParser()
)

# Full chain: retrieve documents and pass the question through in parallel,
# then add the generated "answer" while keeping the retrieved "context" so
# callers can cite the source.
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)
62
+
63
+
64
@cl.on_chat_start
async def on_chat_start():
    """Store the module-level RAG chain in the user session under "runnable"."""
    session = cl.user_session
    session.set("runnable", rag_chain_with_source)
67
+
68
+
69
@cl.on_message
async def on_message(message: cl.Message):
    """Answer a user message with the RAG chain and cite the source video.

    The chain stored in the session under "runnable" is run on the message
    text. When a document was retrieved and its metadata has a "link", the
    reply ends with a markdown link to that video. If the document text
    contains a ``<seconds>`` marker, the link jumps to that time.

    Args:
        message: The incoming chat message; only ``message.content`` is read.
    """
    chain = cl.user_session.get("runnable")
    # Await the async API so the event loop is not blocked while the chain runs.
    response = await chain.ainvoke(message.content)
    if response is None:
        return

    reply = response["answer"]
    docs = response.get("context") or []
    source = docs[0] if docs else None
    link = source.metadata.get("link") if source is not None else None

    if link:
        title = source.metadata.get("source_document", link)
        # The first "<12.34>" marker in the chunk is used as the start time.
        match = re.search(r"<(\d+\.\d+)>", source.page_content)
        if match:
            separator = "&" if "?" in link else "?"
            link = f"{link}{separator}t={int(float(match.group(1)))}s"
        reply = f"{reply} Check out the Youtube video [{title}]({link})!"

    await cl.Message(content=reply).send()
chainlit.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # PT DEMO! 🚀
2
+
3
+ Hello from your friendly PT Assistant! 👋
4
+
5
+ Enter a question to begin chatting!
6
+
7
+ Examples:
8
+ - Who is Aaron LaBauer?
9
+ - Is Aaron an LMT?
10
+ - What does Aaron LaBauer do professionally?
requirements.txt ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.9.5
3
+ aiosignal==1.3.1
4
+ annotated-types==0.7.0
5
+ anyio==3.7.1
6
+ asyncer==0.0.2
7
+ attrs==23.2.0
8
+ bidict==0.23.1
9
+ certifi==2024.2.2
10
+ chainlit==1.1.202
11
+ charset-normalizer==3.3.2
12
+ chevron==0.14.0
13
+ click==8.1.7
14
+ dataclasses-json==0.5.14
15
+ Deprecated==1.2.14
16
+ distro==1.9.0
17
+ fastapi==0.110.3
18
+ fastapi-socketio==0.0.10
19
+ filetype==1.2.0
20
+ frozenlist==1.4.1
21
+ googleapis-common-protos==1.63.0
22
+ grpcio==1.64.0
23
+ h11==0.14.0
24
+ httpcore==1.0.5
25
+ httpx==0.27.0
26
+ idna==3.7
27
+ importlib-metadata==7.0.0
28
+ install==1.3.5
29
+ jsonpatch==1.33
30
+ jsonpointer==2.4
31
+ langchain==0.2.1
32
+ langchain-core==0.2.1
33
+ langchain-openai==0.1.7
34
+ langchain-pinecone==0.1.1
35
+ langchain-text-splitters==0.2.0
36
+ langsmith==0.1.63
37
+ Lazify==0.4.0
38
+ literalai==0.0.601
39
+ marshmallow==3.21.2
40
+ multidict==6.0.5
41
+ mypy-extensions==1.0.0
42
+ nest-asyncio==1.6.0
43
+ numpy==1.26.4
44
+ openai==1.30.3
45
+ opentelemetry-api==1.24.0
46
+ opentelemetry-exporter-otlp==1.24.0
47
+ opentelemetry-exporter-otlp-proto-common==1.24.0
48
+ opentelemetry-exporter-otlp-proto-grpc==1.24.0
49
+ opentelemetry-exporter-otlp-proto-http==1.24.0
50
+ opentelemetry-instrumentation==0.45b0
51
+ opentelemetry-proto==1.24.0
52
+ opentelemetry-sdk==1.24.0
53
+ opentelemetry-semantic-conventions==0.45b0
54
+ orjson==3.10.3
55
+ packaging==23.2
56
+ pinecone-client==3.2.2
57
+ protobuf==4.25.3
58
+ pydantic==2.7.1
59
+ pydantic_core==2.18.2
60
+ PyJWT==2.8.0
61
+ python-dotenv==1.0.1
62
+ python-engineio==4.9.1
63
+ python-multipart==0.0.9
64
+ python-socketio==5.11.2
65
+ PyYAML==6.0.1
66
+ regex==2024.5.15
67
+ requests==2.32.2
68
+ setuptools==70.0.0
69
+ simple-websocket==1.0.0
70
+ sniffio==1.3.1
71
+ SQLAlchemy==2.0.30
72
+ starlette==0.37.2
73
+ syncer==2.0.3
74
+ tenacity==8.3.0
75
+ tiktoken==0.7.0
76
+ tomli==2.0.1
77
+ tqdm==4.66.4
78
+ typing-inspect==0.9.0
79
+ typing_extensions==4.12.0
80
+ uptrace==1.24.0
81
+ urllib3==2.2.1
82
+ uvicorn==0.25.0
83
+ watchfiles==0.20.0
84
+ wrapt==1.16.0
85
+ wsproto==1.2.0
86
+ yarl==1.9.4
87
+ zipp==3.19.0
youtube_to_docstore.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ from dotenv import load_dotenv
5
+ from langchain.embeddings.cache import CacheBackedEmbeddings
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain.storage import LocalFileStore
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ import pandas as pd
10
+ from pinecone import Pinecone, ServerlessSpec
11
+ import requests
12
+ import scrapetube
13
+ from uuid import uuid4
14
+ from youtube_transcript_api import YouTubeTranscriptApi
15
+
16
+ load_dotenv()
17
+
18
+ BATCH_LIMIT = 100
19
+
20
+
21
def get_youtube_data(video_id):
    """Fetch a video's timestamped transcript and its title.

    Args:
        video_id: YouTube video id (the ``v=`` value of a watch URL).

    Returns:
        ``(transcript, title)`` on success. ``transcript`` is every caption
        text immediately followed by its start value as ``<start>``, joined
        by single spaces. Returns ``False`` when no transcript could be
        fetched.

    Raises:
        requests.RequestException: If the metadata request fails, times out
            or returns an error status.
        KeyError: If the metadata response has no ``title`` field.
    """
    url = f"https://www.youtube.com/watch?v={video_id}"

    try:
        raw = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt / SystemExit still propagate.
        print(f"No transcript found for {url}")
        return False

    # Get metadata. Passing the video URL via `params` percent-encodes it so
    # its own "?v=" is not parsed as part of the noembed query string.
    response = requests.get(
        "https://noembed.com/embed",
        params={"dataType": "json", "url": url},
        timeout=10,
    )
    response.raise_for_status()
    title = response.json()["title"]

    # ' is a reserved character
    title = title.replace("'", "")

    # Generate the transcript string with timestamps
    transcript = " ".join(
        f"{entry['text']}<{entry['start']}>" for entry in raw)

    return transcript, title
47
+
48
+
49
def index_video(video_id, embedder, index):
    """Chunk, embed and upsert one video's transcript into the vector index.

    Args:
        video_id: YouTube video id.
        embedder: Object exposing ``embed_documents(texts)``.
        index: Vector index exposing ``upsert(vectors=...)``.

    Returns:
        ``True`` when every batch was upserted; ``False`` when the transcript
        or metadata could not be fetched or an upsert failed.
    """
    print(f"Getting transcript & text for video: {video_id}")
    try:
        data = get_youtube_data(video_id)
    except Exception as e:
        print(f"""Error getting transcript for video {video_id}: {e}""")
        return False
    if not data:
        # get_youtube_data returns False (and reports it) when there is no
        # transcript; nothing to index.
        return False
    transcript, title = data

    url = f"https://www.youtube.com/watch?v={video_id}"

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    record_texts = text_splitter.split_text(transcript)
    print(f"Split documents into {len(record_texts)} chunks")

    # Each chunk's own text is stored under "text" next to the shared video
    # metadata.
    record_metadatas = [
        {"chunk": j, "text": text, "source_document": title, "link": url}
        for j, text in enumerate(record_texts)
    ]

    print(f"Uploading {len(record_texts)} chunks to Pinecone...")
    print("Upserting data to pinecone...")
    # Embed and upsert at most BATCH_LIMIT chunks per request.
    for start in range(0, len(record_texts), BATCH_LIMIT):
        batch_texts = record_texts[start:start + BATCH_LIMIT]
        batch_metadatas = record_metadatas[start:start + BATCH_LIMIT]
        ids = [str(uuid4()) for _ in batch_texts]
        embeds = embedder.embed_documents(batch_texts)
        try:
            # A concrete list, not a one-shot zip iterator.
            index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))
        except Exception as e:
            print(f"Error upserting data to Pinecone: {e}")
            return False

    return True
98
+
99
+
100
def index_channel(channel_id, embedder, index):
    """Index every video that scrapetube yields for the given channel.

    Args:
        channel_id: Channel identifier passed to ``scrapetube.get_channel``.
        embedder: Forwarded to ``index_video``.
        index: Forwarded to ``index_video``.
    """
    print("Indexing channel...")
    for entry in scrapetube.get_channel(channel_id):
        video_id = entry["videoId"]
        print(f"Ready to process {video_id}")
        index_video(video_id, embedder, index)
107
+
108
+
109
def configure_vector_database():
    """Open (creating if needed) the Pinecone index and build a cached embedder.

    The index name is read from the ``INDEX_NAME`` environment variable and
    the API key from ``PINECONE_API_KEY``.

    Returns:
        ``(embedder, index)``: a cache-backed embeddings object and the
        Pinecone index handle.

    Raises:
        RuntimeError: If ``INDEX_NAME`` is not set.
    """
    print("Configuring Pinecone...")

    index_name = os.getenv("INDEX_NAME")
    if not index_name:
        raise RuntimeError("INDEX_NAME environment variable is not set")

    pc = Pinecone(
        api_key=os.getenv("PINECONE_API_KEY")
    )

    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            metric='cosine',
            # NOTE(review): presumably the vector size of the default
            # embedding model noted below -- confirm if the model changes.
            dimension=1536,
            spec=ServerlessSpec(
                cloud="aws", region="us-east-1"
            )
        )

    index = pc.Index(index_name)
    # Embeddings are cached on disk under ./cache/.
    store = LocalFileStore("./cache/")
    # default model is text-embedding-ada-002
    core_embeddings_model = OpenAIEmbeddings()

    embedder = CacheBackedEmbeddings.from_bytes_store(
        core_embeddings_model,
        store,
        namespace=core_embeddings_model.model
    )

    return embedder, index
137
+
138
+
139
# Script entry point: index every video of the channel named by CHANNEL_ID.
if __name__ == "__main__":
    embedder, index = configure_vector_database()
    # os.getenv is a function; it must be called, not subscripted.
    index_channel(os.getenv("CHANNEL_ID"), embedder, index)