madhiemw commited on
Commit
aa2d91f
·
verified ·
1 Parent(s): 0444545

Upload 25 files

Browse files
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from config import Config
from pipeline.main_pipeline import QAPipeline
from typing import List


def create_app():
    """Build and return the Gradio Q&A interface wired to the RAG pipeline.

    Returns:
        gr.Interface: question box + collection dropdown in, AI response +
        retrieved documents out.
    """
    config = Config()
    pipeline = QAPipeline(config)

    # BUG FIX: gr.Interface passes exactly one positional argument per
    # declared input. Only two inputs are declared below, but the original
    # handler also required a `history` parameter, so every submission
    # would have raised TypeError. The unused parameter is removed.
    def process_message(message: str, collection_name: str):
        retrieved_docs, response = pipeline.process(message, collection_name)
        # Outputs are (AI Response, Retrieved Documents) — note the swap
        # relative to pipeline.process()'s (docs, response) return order.
        return response, retrieved_docs

    interface = gr.Interface(
        fn=process_message,
        inputs=[
            gr.Textbox(label="Your Question"),
            gr.Dropdown(choices=["QnA data", "Semantic Data"], label="Select Collection", value="Semantic Data")
        ],
        outputs=[
            gr.Textbox(label="AI Response"),
            gr.Textbox(label="Retrieved Documents from ChromaDB"),
        ],
        title="Muhammad Adhiem Wicaksana Vidavox Technical Test",
        # Typo fix: "crhoma db" -> "ChromaDB".
        description="Ask questions and choose a ChromaDB collection, read my documentation for the purpose of the collection :)",
    )

    return interface


if __name__ == "__main__":
    app = create_app()
    app.launch()
config.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os


class Config:
    """Central runtime configuration for the QA app.

    SECURITY: earlier revisions committed live credentials (Groq API key,
    ChromaDB token, MySQL password) directly in this file. They are now read
    from environment variables. The committed fallback values below are
    already public/compromised — rotate them on the respective services and
    then delete the fallbacks entirely.
    """

    # --- ChromaDB (remote, token-authenticated HTTP) ---
    CHROMADB_HOST = os.getenv("CHROMADB_HOST", "https://chromadb-production-7af5.up.railway.app")
    CHROMADB_TOKEN = os.getenv("CHROMADB_TOKEN", "w587yz1cvfk8tcbf21taesnxmdt2rcsm")  # ROTATE: leaked in git history

    # --- Groq LLM API ---
    GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
    GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")  # ROTATE: leaked in git history

    # --- Retrieval defaults ---
    DEFAULT_COLLECTION = "test11"
    N_RESULTS = 5  # top-k documents returned per ChromaDB query

    # --- MySQL (Railway-hosted) ---
    MYSQL_HOST = os.getenv("MYSQL_HOST", "junction.proxy.rlwy.net")
    MYSQL_USER = os.getenv("MYSQL_USER", "root")
    MYSQL_PASSWORD = os.getenv("MYSQL_PASSWORD", "YtxIHzEUaRIJyNKaecBOTylHKGbbEitk")  # ROTATE: leaked in git history
    MYSQL_DATABASE = os.getenv("MYSQL_DATABASE", "railway")
    MYSQL_PORT = int(os.getenv("MYSQL_PORT", "54826"))  # port stays an int for mysql.connector
pipeline/__pycache__/chromadb_search.cpython-311.pyc ADDED
Binary file (1.92 kB). View file
 
pipeline/__pycache__/document_processor.cpython-311.pyc ADDED
Binary file (1.6 kB). View file
 
pipeline/__pycache__/embeddings.cpython-311.pyc ADDED
Binary file (829 Bytes). View file
 
pipeline/__pycache__/generate_sql.cpython-311.pyc ADDED
Binary file (2.89 kB). View file
 
pipeline/__pycache__/groq_client.cpython-311.pyc ADDED
Binary file (2.67 kB). View file
 
pipeline/__pycache__/main_pipeline.cpython-311.pyc ADDED
Binary file (3.8 kB). View file
 
pipeline/__pycache__/qa_pipeline.cpython-311.pyc ADDED
Binary file (3.02 kB). View file
 
pipeline/__pycache__/sql_query.cpython-311.pyc ADDED
Binary file (3.05 kB). View file
 
pipeline/__pycache__/sql_response.cpython-311.pyc ADDED
Binary file (2.56 kB). View file
 
pipeline/chromadb_search.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import chromadb
from typing import List, Dict, Any
from config import Config

class ChromaDBPipeline:
    """Thin wrapper around a remote, token-authenticated ChromaDB instance."""

    def __init__(self, config: Config):
        # HTTP client against the remote host; auth is a bearer token header.
        self.client = chromadb.HttpClient(
            host=config.CHROMADB_HOST,
            headers={"Authorization": f"Bearer {config.CHROMADB_TOKEN}"}
        )
        self.n_results = config.N_RESULTS

    def query(self, embedding: List[float], collection_name: str) -> Dict[str, Any]:
        """Search ChromaDB with a precomputed embedding.

        Args:
            embedding: dense query vector (computed by the caller).
            collection_name: existing collection to search.

        Returns:
            Raw ChromaDB query result dict (top ``n_results`` matches).

        Raises:
            Exception: wraps any client/collection failure, with the original
            exception chained as the cause.
        """
        try:
            collection = self.client.get_collection(
                name=collection_name,
                # embeddings are supplied explicitly, so no server-side
                # embedding function is needed
                embedding_function=None
            )
            return collection.query(
                query_embeddings=[embedding],
                n_results=self.n_results
            )
        except Exception as e:
            # Chain the cause so the underlying chromadb error survives the re-wrap.
            raise Exception(f"ChromaDB search error: {str(e)}") from e
pipeline/document_processor.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List

class DocumentProcessor:
    """Flattens ChromaDB query results into a single context string."""

    @staticmethod
    def process(results: dict) -> str:
        """Return all retrieved documents joined by single spaces.

        ChromaDB returns ``{'documents': [[doc, ...]]}`` — one inner list per
        query embedding — so one level of nesting is flattened and every
        entry is stringified.

        Args:
            results: raw ChromaDB query result (may be None/empty).

        Returns:
            Space-joined document text, or "" when there is nothing to join.
        """
        # BUG FIX: the original only checked key presence, so a present-but-None
        # 'documents' value crashed on " ".join(None). get() + truthiness
        # covers missing, None, and empty in one test.
        if not results or not results.get('documents'):
            return ""

        relevant_documents = results['documents']
        if isinstance(relevant_documents, list):
            # Flatten [[...], [...]] -> [...] (one inner list per query).
            if relevant_documents and isinstance(relevant_documents[0], list):
                relevant_documents = [item for sublist in relevant_documents for item in sublist]
            relevant_documents = [str(doc) for doc in relevant_documents]

        return " ".join(relevant_documents)
pipeline/embeddings.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
from utils.sentence_transformer_util import encode_query

class EmbeddingPipeline:
    """Turns raw question text into a dense embedding vector."""

    @staticmethod
    def process(text: str):
        """Embed *text* and return its vector.

        NOTE(review): the text is wrapped as the pair ("pertanyaan", text)
        before encoding — presumably to mirror how the indexed documents
        were encoded; confirm against the indexing script.
        """
        return encode_query(("pertanyaan", text))
pipeline/groq_client.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
from typing import Dict
from config import Config

class GroqPipeline:
    """Client for the Groq chat-completions API used to answer questions."""

    def __init__(self, config: Config):
        self.api_url = config.GROQ_API_URL
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {config.GROQ_API_KEY}"
        }

    def generate_response(self, context: str, question: str) -> str:
        """Generate an answer grounded in *context* using the Groq API.

        Args:
            context: retrieved document text used as the factual basis.
            question: the user's question.

        Returns:
            The model's answer text. The prompt instructs the model to append
            a '#table' tag when the question relates to efficiency, which the
            caller later uses to trigger the SQL agent.

        Raises:
            Exception: on non-200 responses or network errors (cause chained).
        """
        try:
            data = {
                "model": "llama-3.1-8b-instant",
                "messages": [{
                    "role": "user",
                    # Prompt kept verbatim: its exact wording (including the
                    # '#table' convention) is load-bearing for downstream parsing.
                    "content": (f"embedding result: {context}\n"
                                f"Answer the following user question: {question}, "
                                "the name of the robot is AV-826 "
                                "and answer it as a professional CS."
                                "just use embedding result as a fact for the answer and "
                                "if the question is has relation with effecency just add '#table' tag as a regex at the end of your output the db contain data about av-826 testing data and performance such as operation temperature and sensor accuracy"
                                "dont create any fake data, fake table or fake sql code. just summary the embedding result"
                                "if the question doesnt has relation with the robot product give polite feedback")
                }],
                "temperature": 0.1  # low temperature: keep answers close to the retrieved facts
            }

            # BUG FIX: the original post had no timeout and could hang the app
            # forever on a stalled connection.
            response = requests.post(self.api_url, json=data, headers=self.headers, timeout=30)

            if response.status_code == 200:
                return response.json()['choices'][0]['message']['content']
            else:
                raise Exception(f"Groq API error: {response.status_code}")
        except Exception as e:
            # Chain so the original HTTP/parse error survives the re-wrap.
            raise Exception(f"Groq generation error: {str(e)}") from e
pipeline/main_pipeline.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from config import Config
from pipeline.embeddings import EmbeddingPipeline
from pipeline.chromadb_search import ChromaDBPipeline
from pipeline.document_processor import DocumentProcessor
from pipeline.groq_client import GroqPipeline
from pipeline.sql_query import MYSQL_Generator
from pipeline.sql_response import DB_Response_Generator
# NOTE(review): detect_none_in_text is imported but never used below —
# possibly it was meant to validate generateqsl; confirm intent before removing.
from utils.regex import remove_table_and_text, detect_none_in_text
from utils.mysql_util import MySQLDatabase_execute

class QAPipeline:
    """End-to-end RAG pipeline: embed -> ChromaDB search -> LLM answer,
    with an optional SQL-agent detour when the answer carries a '#table' tag."""

    def __init__(self, config: Config):
        self.config = config
        self.embedding_pipeline = EmbeddingPipeline()
        self.chromadb_pipeline = ChromaDBPipeline(config)
        self.document_processor = DocumentProcessor()
        self.groq_pipeline = GroqPipeline(config)
        self.sql_generator = MYSQL_Generator(config)
        self.sql_execute = MySQLDatabase_execute(config)
        self.sql_response = DB_Response_Generator(config)

    def process(self, question: str, collection_name: str):
        """Run the complete QA pipeline and return retrieved documents + AI response"""
        try:
            # Map the UI-facing dropdown labels to actual collection names.
            if collection_name == "QnA data":
                collection_name = "test11"
            elif collection_name == "Semantic Data":
                collection_name = "test12"

            # 1) Embed the question.
            embedding = self.embedding_pipeline.process(question)

            # 2) Vector search in ChromaDB.
            search_results = self.chromadb_pipeline.query(
                embedding,
                collection_name
            )

            # 3) Flatten search hits into one context string.
            retrieved_text = self.document_processor.process(search_results)

            if not retrieved_text:
                return "No relevant documents found.", "No relevant information found."

            # 4) Ask the LLM; its prompt appends '#table' for efficiency questions.
            response = self.groq_pipeline.generate_response(retrieved_text, question)
            # remove_table_and_text returns False when no '#table' tag is present,
            # otherwise the response text with the tag stripped.
            tabledetect = remove_table_and_text(response)
            # NOTE(review): `== False` also matches a stripped response of ""
            # (empty string == False is False, but a falsy non-False value would
            # fall through to the SQL branch) — confirm that is intended.
            if tabledetect == False:
                return retrieved_text, response
            else:
                # 5) '#table' seen: ask the LLM for a SQL query over the test DB.
                generateqsl = self.sql_generator.generate_response(question)
                if generateqsl is not None:
                    sql_execute = self.sql_execute.execute_query(generateqsl)
                    # NOTE(review): execute_query returns rows or None, never the
                    # string "None" — this comparison looks always-False; was
                    # `sql_execute is None` (or detect_none_in_text) intended?
                    if sql_execute == "None":
                        return retrieved_text, tabledetect
                    else:
                        # 6) Summarize the DB rows in plain language and append.
                        sql_response = self.sql_response.generate_response(context=question, db_result=sql_execute, sqlcode=generateqsl)
                        final_response = tabledetect + "\n\n --------connected with SQL Agent--------\n\n--------Database analyzing result--------\n\n" + sql_response
                        return retrieved_text, final_response
                else :
                    # SQL generation declined/failed: fall back to the stripped answer.
                    return retrieved_text, tabledetect

        except Exception as e:
            # Errors are surfaced in both output boxes rather than raised to Gradio.
            return f"Error retrieving documents: {str(e)}", f"Pipeline error: {str(e)}"
pipeline/sql_query.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
from config import Config
from utils.regex import extract_sql_query

class MYSQL_Generator:
    """Asks the Groq API to translate a user question into one MySQL query
    against the fixed `vidavox` test-data table."""

    def __init__(self, config: Config):
        self.api_url = config.GROQ_API_URL
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {config.GROQ_API_KEY}"
        }

    def generate_response(self, context: str) -> str:
        """Generate a SQL query for *context* using the Groq API.

        Returns:
            The extracted SQL string, or None when no ```sql fence was found
            in the model output OR when the request failed. Callers treat any
            non-None value as executable SQL, so failure must map to None.
        """
        try:
            data = {
                "model": "llama-3.1-8b-instant",
                "messages": [{
                    "role": "user",
                    # Prompt kept verbatim (schema + example row + constraints).
                    "content": (f"question: {context}\n"
                                "the table name is vidavox"
                                "the table is about av826 testing data"
                                "---below is the column table of MySQL db---"
                                "test_id VARCHAR(10) PRIMARY KEY,test_date DATE,model_name VARCHAR(50),noise_level_db DECIMAL(5, 2),cleaning_efficiency_percent DECIMAL(5, 2), battery_duration_minutes INT, area_covered_sqm DECIMAL(10, 2), dust_collection_grams DECIMAL(10, 2), operating_temperature_celsius DECIMAL(5, 2),maintenance_score DECIMAL(5, 2), navigation_accuracy_percent DECIMAL(5, 2),software_version VARCHAR(50)"
                                "---and below is the data example from the DB---"
                                "('TST0000020', '2024-01-07', 'AV826', 56.9, 95.8, 112, 42.41, 87.0, 26.6, 9, 96.9, 'v2.1.0')"
                                "your task is to generate just 1 MySQL code relateed above column and dont create any description to answer user question just use MySQL code "
                                "always limit the data only 10 line "
                                "dont create any fake mysql code"
                                "return none if the question doesn't has relation with the column and example data above"
                                )
                }],
                "temperature": 0.1
            }

            # BUG FIX: no timeout meant a stalled connection hung the pipeline.
            response = requests.post(self.api_url, json=data, headers=self.headers, timeout=30)

            if response.status_code == 200:
                response_text = response.json()['choices'][0]['message']['content']
                return extract_sql_query(response_text)
            else:
                raise Exception(f"Groq API error: {response.status_code}, {response.text}")

        except Exception as e:
            # BUG FIX: the original returned str(e) here; the caller only checks
            # `is not None` and would then EXECUTE the error message as SQL.
            # Returning None routes failures to the caller's safe fallback path.
            print(f"SQL generation error: {e}")
            return None
pipeline/sql_response.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
from config import Config

class DB_Response_Generator:
    """Asks the Groq API to summarize raw SQL results in plain language."""

    def __init__(self, config: Config):
        self.api_url = config.GROQ_API_URL
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {config.GROQ_API_KEY}"
        }

    def generate_response(self, context: str, db_result: str, sqlcode: str) -> str:
        """Summarize *db_result* (rows returned by *sqlcode*) to answer *context*.

        Args:
            context: the original user question.
            db_result: rows fetched from MySQL.
            sqlcode: the SQL statement that produced those rows.

        Returns:
            A non-technical summary from the model, or — preserving existing
            behavior — the stringified error when the request fails (the
            caller displays this text directly to the user).
        """
        try:
            data = {
                "model": "llama-3.1-8b-instant",
                "messages": [{
                    "role": "user",
                    # Prompt kept verbatim.
                    "content": (f"question: {context}\n"
                                "heres the table column test_id,test_date,model_name,noise_level_db,cleaning_efficiency_percent, battery_duration_minutes INT, area_covered_sqm, dust_collection_grams, operating_temperature_celsius ,maintenance_score, navigation_accuracy_percent ,software_version\n"
                                f"and heres the sql code: {sqlcode} "
                                "below is result of sql query\n"
                                f"{db_result}\n"
                                "make summary and use non technical sentence to explain it into user and use it to answer user question ")
                }],
                "temperature": 0.1
            }

            # BUG FIX: no timeout meant a stalled connection hung the pipeline.
            response = requests.post(self.api_url, json=data, headers=self.headers, timeout=30)

            if response.status_code == 200:
                response_text = response.json()['choices'][0]['message']['content']
                return response_text
            else:
                raise Exception(f"Groq API error: {response.status_code}, {response.text}")

        except Exception as e:
            return str(e)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ sentence-transformers
2
+ scikit-learn
3
+ requests
4
+ regex
5
+ chromadb
6
+ mysql-connector-python
utils/__pycache__/mysql_util.cpython-311.pyc ADDED
Binary file (2.86 kB). View file
 
utils/__pycache__/regex.cpython-311.pyc ADDED
Binary file (1.45 kB). View file
 
utils/__pycache__/sentence_transformer_util.cpython-311.pyc ADDED
Binary file (915 Bytes). View file
 
utils/mysql_util.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import mysql.connector
from config import Config

class MySQLDatabase_execute:
    """Lazily-connected MySQL helper: connects on first query, returns rows."""

    def __init__(self, config: Config):
        self.config = config
        self.connection = None  # opened on demand by execute_query()

    def connect(self):
        """Establish connection to MySQL database.

        Failures are logged and swallowed, leaving self.connection as None —
        callers must check for that.
        """
        try:
            self.connection = mysql.connector.connect(
                host=self.config.MYSQL_HOST,
                user=self.config.MYSQL_USER,
                password=self.config.MYSQL_PASSWORD,
                database=self.config.MYSQL_DATABASE,
                port=self.config.MYSQL_PORT
            )
            print("Connected to MySQL database successfully.")
        except mysql.connector.Error as err:
            print(f"Error: {err}")

    def execute_query(self, query):
        """Execute the SQL query and return the fetched rows, or None on failure."""
        try:
            if self.connection is None:
                self.connect()

            # BUG FIX: connect() swallows failures, so self.connection could
            # still be None here; the original then crashed with AttributeError
            # on None.cursor(). Bail out with the documented failure value.
            if self.connection is None:
                return None

            cursor = self.connection.cursor()
            try:
                cursor.execute(query)
                return cursor.fetchall()
            finally:
                # BUG FIX: the original leaked the cursor when execute/fetch raised.
                cursor.close()

        except mysql.connector.Error as err:
            print(f"Query Error: {err}")
            return None

    def close(self):
        """Close the database connection"""
        if self.connection:
            self.connection.close()
            print("Database connection closed.")
utils/regex.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re

def remove_table_and_text(text):
    """Look for the '#table' marker emitted by the LLM.

    Returns False when the marker is absent; otherwise returns *text* with
    the marker and everything after it removed (stripped of whitespace).
    """
    marker = re.compile(r"#table.*", re.DOTALL)
    if marker.search(text) is None:
        return False
    return marker.sub("", text).strip()
9
+
10
+
11
def extract_sql_query(text):
    """Pull the SQL statement out of a ```sql fenced block.

    Returns the stripped statement, or None when no fence is present.
    """
    fenced = re.search(r'```sql\n(.*?)```', text, re.DOTALL)
    return fenced.group(1).strip() if fenced else None
def detect_none_in_text(text: str) -> bool:
    """Return True when the literal token 'None' (case-sensitive) appears in *text*.

    Fix: the original had a stray duplicate `import re` in the middle of the
    module (it is already imported at the top); the regex search over the
    metacharacter-free pattern "None" is also just a substring test.
    """
    if not text:
        return False
    return "None" in text
utils/sentence_transformer_util.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
from sentence_transformers import SentenceTransformer

# Loaded once at import time; model download/initialization happens here,
# so importing this module is slow on first run.
model = SentenceTransformer('LazarusNLP/all-indo-e5-small-v4')

def encode_query(query: str):
    """Encode *query* into an embedding vector using SentenceTransformer.

    Returns the first (only) vector from a single-item batch.

    Raises:
        Exception: wraps any encoding failure, with the cause chained.
    """
    try:
        return model.encode([query])[0]
    except Exception as e:
        # Chain the cause so the underlying encoder error survives the re-wrap.
        raise Exception(f"Error encoding query: {str(e)}") from e