Commit 6961452 · Parent: 7fc2087
Yt video source added
Files changed:
- .gitignore +5 -1
- Dockerfile +17 -0
- Example/__init__.py +0 -0
- Example/rag_example.py +10 -0
- Rag/{chunking.py → rag_pipeline.py} +19 -13
- requirements.txt +1 -0
- utils/__init__.py +0 -0
- {Rag → utils}/corefrence.py +1 -1
- utils/get_link.py +11 -0
- {Rag → utils}/summarization.py +0 -0
.gitignore
CHANGED
@@ -129,7 +129,11 @@ Rag/chromadb.db/
 
 # mkdocs documentation
 /site
-
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.env
 # mypy
 .mypy_cache/
 .dmypy.json
Dockerfile
ADDED
@@ -0,0 +1,17 @@
+# Use an official Python runtime as a base image
+FROM python:3.9-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the requirements file into the container
+COPY requirements.txt .
+
+# Install dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of your application
+COPY . .
+
+# Command to run your application
+CMD ["python", "-m", "Rag"]
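Note that `CMD ["python", "-m", "Rag"]` assumes a `Rag/__main__.py`, which this commit does not add; the entry point the commit does add is Example/rag_example.py, so the CMD may need adjusting. For local testing, something like `docker build -t xyzbot .` followed by `docker run --rm -e GOOGLE_API_KEY=<key> xyzbot` should work (tag name hypothetical; the key is passed through because the pipeline reads `GOOGLE_API_KEY` from the environment).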
Example/__init__.py
ADDED
File without changes
Example/rag_example.py
ADDED
@@ -0,0 +1,10 @@
+import chromadb
+transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
+chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
+client = chromadb.PersistentClient(path=chromadb_path)
+collection = client.get_or_create_collection(name="yt_transcript_collection")
+from Rag.rag_pipeline import main_workflow
+
+# Run the application
+if __name__ == "__main__":
+    main_workflow(transcripts_folder_path, collection)
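As committed, this example hardcodes `/home/nightwing/...` paths, which will not exist inside the Docker image above. A minimal portable sketch, with the environment-variable names `TRANSCRIPTS_DIR` and `CHROMADB_PATH` invented for illustration:

import os
import chromadb
from Rag.rag_pipeline import main_workflow

# Hypothetical env vars; the fallbacks are repo-relative paths
transcripts_folder_path = os.environ.get("TRANSCRIPTS_DIR", "Data/transcripts")
chromadb_path = os.environ.get("CHROMADB_PATH", "Rag/chromadb.db")

client = chromadb.PersistentClient(path=chromadb_path)
collection = client.get_or_create_collection(name="yt_transcript_collection")

if __name__ == "__main__":
    main_workflow(transcripts_folder_path, collection)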
Rag/{chunking.py → rag_pipeline.py}
RENAMED
@@ -3,18 +3,17 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 import google.generativeai as genai
 import os
-import json
 import logging
 from Llm.llm_endpoints import get_llm_response
-from
-from Rag.corefrence import resolve_coreference_in_query
+from utils.get_link import get_source_link
+# from Rag.corefrence import resolve_coreference_in_query
 # Configuration
 API_KEY = os.getenv("GOOGLE_API_KEY")
 if API_KEY:
     genai.configure(api_key=API_KEY)
 
 chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
-transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
+# transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
 processed_files_path = "/home/nightwing/Codes/Xyzbot/Rag/Processed_folder/processed_files.json"
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
@@ -88,17 +87,22 @@ def update_conversation_history(history, user_query, bot_response):
     return history
 
 
-def generate_response(conversation_history, query_text, retrieved_docs):
+def generate_response(conversation_history, query_text, retrieved_docs, source_links):
     """Generate a response using retrieved documents and the generative AI model."""
 
     context = " ".join(retrieved_docs)
     history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
+    sources_str = "\n".join(source_links)
+
     prompt = f"""
     Using the context below and the conversation history, answer the question:
 
     Context:
     {context}
 
+    Conversation Sources:
+    {sources_str}
+
     Conversation History:
     {history_str}
 
@@ -106,7 +110,10 @@ def generate_response(conversation_history, query_text, retrieved_docs):
     """
 
     response = get_llm_response(prompt)
-    return response
+
+    # Append sources to the response
+    full_response = f"{response}\n\nSources:\n{sources_str}"
+    return full_response
 
 
 # Main Workflow
@@ -128,20 +135,19 @@ def main_workflow(transcripts_folder_path, collection):
             print("Ending the conversation. Goodbye")
             break
         query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
-        resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
-        retrived_docs, metadatas = query_database(collection, resolved_query)
+        # resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
+        retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)
         print("-" * 50)
-
+        source_link = get_source_link(metadatas)
+        print(source_link)
         print("-" * 50)
         if not retrived_docs:
             print("No relevent documents is found")
             continue
-        response = generate_response(conversation_history, query_text, retrived_docs)
+        response = generate_response(conversation_history, query_text, retrived_docs, source_link)
         conversation_history = update_conversation_history(conversation_history, query_text, response)
         print("\nGenerated Response:")
         print(response)
 
 
-
-if __name__ == "__main__":
-    main_workflow(transcripts_folder_path, collection)
+
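For reference, a hypothetical call showing the shapes the updated `generate_response` expects (all values invented for illustration):

# conversation_history is a list of {'user': ..., 'bot': ...} turns
conversation_history = [
    {"user": "What is the video about?", "bot": "It covers RAG pipelines."},
]
retrieved_docs = ["transcript chunk one", "transcript chunk two"]
source_links = ["https://www.youtube.com/watch?v=VIDEO_ID"]  # from get_source_link(metadatas)

response = generate_response(conversation_history, "Who is speaking?", retrieved_docs, source_links)
# response is the LLM answer with "\n\nSources:\n" + the joined links appended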
requirements.txt
CHANGED
@@ -13,3 +13,4 @@ flask_cors
 sentence_transformers
 tqdm
 torch
+transformers
utils/__init__.py
ADDED
File without changes
{Rag → utils}/corefrence.py
RENAMED
@@ -1,6 +1,6 @@
 from transformers import pipeline
 
-coref_pipeline = pipeline("coref-resolution", model="coref-
+coref_pipeline = pipeline("coref-resolution", model="coref-roberta-large")
 
 
 def resolve_coreference_in_query(query_text, conversation_history):
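One caution: "coref-resolution" is not one of the built-in `transformers` pipeline tasks, so constructing this pipeline likely raises as soon as the module is imported, which would explain why rag_pipeline.py now comments out `resolve_coreference_in_query`. A defensive sketch that keeps the module importable:

from transformers import pipeline

# Sketch: guard the construction, since "coref-resolution" is not a
# registered transformers pipeline task and may raise immediately.
try:
    coref_pipeline = pipeline("coref-resolution", model="coref-roberta-large")
except Exception:
    coref_pipeline = None  # callers should fall back to the raw query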
utils/get_link.py
ADDED
@@ -0,0 +1,11 @@
+def get_source_link(metadatas):
+    link = 'https://www.youtube.com/watch?v='
+    yt_link = []
+    for metadata in metadatas:
+        source = metadata['source']
+        values = source.split('.txt')
+
+        link = link + values[0]
+        yt_link.append(link)
+    # print(yt_link)
+    return yt_link
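One bug to flag here: `link` is mutated inside the loop, so each appended URL accumulates the IDs of all earlier videos (the second entry becomes `...watch?v=<id1><id2>`). A corrected sketch, assuming each `metadata['source']` is a `<video_id>.txt` filename:

def get_source_link(metadatas):
    base = 'https://www.youtube.com/watch?v='
    yt_link = []
    for metadata in metadatas:
        video_id = metadata['source'].split('.txt')[0]
        yt_link.append(base + video_id)  # build each URL from the clean base
    return yt_link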
{Rag → utils}/summarization.py
RENAMED
File without changes