Spaces:
Build error
Build error
jskinner215
committed on
Commit
•
25fc3a2
1
Parent(s):
73d96e9
Upload 17 files
Browse files- README.md +12 -11
- answer_presenter.py +4 -0
- app.py +30 -0
- chroma_db_indexer.py +9 -0
- csv_loader.py +9 -0
- embedding_generator.py +16 -0
- question_answerer.py +16 -0
- requirements.txt +5 -0
- run.sh +3 -0
- setup_instructions.md +21 -0
- tapas_application_specification.txt +30 -0
- test_answer_presenter.py +14 -0
- test_application.py +12 -0
- test_chroma_db_indexer.py +14 -0
- test_csv_loader.py +14 -0
- test_embedding_generator.py +9 -0
- test_question_answerer.py +17 -0
README.md
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
-
|
2 |
-
title: TAPASxHF2
|
3 |
-
emoji: π
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: blue
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.26.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Here are the main classes, functions, and methods that will be implemented:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
1. `CSVLoader`: This class will have methods for uploading CSV files and converting them into pandas dataframes with the required structure.
|
4 |
+
2. `EmbeddingGenerator`: This class will have a method for generating embeddings for the data using the SentenceTransformer model.
|
5 |
+
3. `ChromaDBIndexer`: This class will have methods for creating a chroma db index and uploading the generated embeddings to the index.
|
6 |
+
4. `QuestionAnswerer`: This class will have methods for processing user's queries, querying the chroma db for the relevant table, and running the table and query through the TAPAS pipeline for answering.
|
7 |
+
5. `AnswerPresenter`: This class will have methods for presenting the answers returned from TAPAS in the UI and executing any suggested aggregator functions.
|
8 |
+
6. `run.sh`: This is the entrypoint of the program. It will start the application and handle any command line arguments.
|
9 |
+
7. `requirements.txt`: This file will list all the Python package dependencies for the application.
|
10 |
+
|
11 |
+
Now, let's implement the code for each file.
|
12 |
+
|
13 |
+
run.sh
|
answer_presenter.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class AnswerPresenter:
    """Formats answers returned by the TAPAS pipeline for display.

    A TAPAS answer is a dict with keys ``answer`` (display text),
    ``cells`` (raw cell values) and ``aggregator`` (``"NONE"``,
    ``"SUM"``, ``"AVERAGE"`` or ``"COUNT"``).
    """

    # Aggregator name -> reduction applied over the numeric cell values.
    _AGGREGATORS = {
        "SUM": sum,
        "AVERAGE": lambda values: sum(values) / len(values),
        "COUNT": len,
    }

    def present_answer(self, answer):
        """Return the final display string for *answer*.

        If the answer carries a known aggregator and cell values, the
        aggregator is executed over the numeric cells before presenting;
        otherwise the raw ``answer`` text is returned.
        """
        aggregator = answer.get("aggregator", "NONE")
        cells = answer.get("cells")
        if aggregator in self._AGGREGATORS and cells:
            result = self._AGGREGATORS[aggregator]([float(c) for c in cells])
            # Render whole numbers without a trailing ".0".
            if isinstance(result, float) and result.is_integer():
                result = int(result)
            return str(result)
        return str(answer.get("answer", ""))

    # Short alias kept for callers/tests that use the brief name.
    present = present_answer
|
app.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st

from csv_loader import CSVLoader
from embedding_generator import EmbeddingGenerator
from chroma_db_indexer import ChromaDBIndexer
from question_answerer import QuestionAnswerer
from answer_presenter import AnswerPresenter


def main():
    """Streamlit entry point: index uploaded CSVs, then answer questions."""
    st.title("TAPAS LLM Application")

    loader = CSVLoader()
    generator = EmbeddingGenerator()
    indexer = ChromaDBIndexer()
    answerer = QuestionAnswerer()
    presenter = AnswerPresenter()

    # Stage 1: load any uploaded CSV files and index their embeddings.
    uploads = st.file_uploader("Upload CSV", type='csv', accept_multiple_files=True)
    if uploads:
        frames = loader.load_csvs(uploads)
        vectors = generator.generate_embeddings(frames)
        indexer.create_index(vectors)

    # Stage 2: retrieve the relevant table and run it through TAPAS.
    query = st.text_input("Enter your question")
    if query:
        relevant_table = answerer.query_table(query)
        result = answerer.answer_question(query, relevant_table)
        presenter.present_answer(result)


if __name__ == "__main__":
    main()
|
chroma_db_indexer.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chromadb


class ChromaDBIndexer:
    """Stores table embeddings in an in-memory ChromaDB collection.

    Fixes the original ``from chroma_db import ChromaDB`` import: the
    package published on PyPI is ``chromadb`` and exposes a ``Client``
    (there is no ``chroma_db`` module, which broke the build).
    """

    def __init__(self, collection_name="tables"):
        # In-process client; get_or_create keeps construction idempotent.
        self.client = chromadb.Client()
        self.collection = self.client.get_or_create_collection(name=collection_name)

    def create_index(self, embeddings):
        """Add one embedding per table to the collection.

        *embeddings* is a sequence of vectors; each entry may be a flat
        list or a (1, dim) array as produced by EmbeddingGenerator.
        """
        for idx, vector in enumerate(embeddings):
            # chromadb expects plain float lists; flatten array inputs.
            if hasattr(vector, "flatten"):
                vector = vector.flatten().tolist()
            self.collection.add(ids=[str(idx)], embeddings=[vector])
|
csv_loader.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd


class CSVLoader:
    """Loads uploaded CSV files into pandas DataFrames."""

    def load(self, file):
        """Load a single CSV path or file-like object into a DataFrame."""
        return pd.read_csv(file)

    def load_csvs(self, files):
        """Load several CSV files; returns a list of DataFrames."""
        return [self.load(f) for f in files]

    def convert_to_required_structure(self, df):
        """Return *df* with every cell coerced to string.

        The TAPAS tokenizer requires string-typed table cells, so this
        is the "required structure" for downstream question answering.
        """
        return df.astype(str)
|
embedding_generator.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModel
import torch


class EmbeddingGenerator:
    """Generates one mean-pooled embedding vector per DataFrame."""

    def __init__(self, model_name="deepset/all-mpnet-base-v2-table"):
        # Model name parameterized; default preserves previous behavior.
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        # eval() disables dropout so embeddings are deterministic.
        self.model.eval()

    def generate_embeddings(self, dataframes):
        """Return a list with one (1, hidden_dim) numpy array per frame.

        Each DataFrame is serialized with ``to_string`` and encoded; the
        token embeddings are mean-pooled over the sequence dimension.
        """
        embeddings = []
        # no_grad: inference only — skips building the autograd graph
        # (also makes .detach() before .numpy() unnecessary).
        with torch.no_grad():
            for df in dataframes:
                inputs = self.tokenizer(
                    df.to_string(index=False),
                    return_tensors='pt',
                    truncation=True,
                    padding=True,
                )
                outputs = self.model(**inputs)
                embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())
        return embeddings
|
question_answerer.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import TapasTokenizer, TapasForQuestionAnswering, pipeline


class QuestionAnswerer:
    """Answers natural-language questions about tables via TAPAS."""

    def __init__(self, model_name="google/tapas-large-finetuned-wtq"):
        # Model name parameterized; default preserves previous behavior.
        self.model_name = model_name
        self.tokenizer = TapasTokenizer.from_pretrained(self.model_name)
        self.model = TapasForQuestionAnswering.from_pretrained(self.model_name)
        self.pipe = pipeline(
            "table-question-answering",
            model=self.model,
            tokenizer=self.tokenizer,
        )

    def query_table(self, query):
        """Return the table most relevant to *query* from the chroma index.

        TODO: implement the chroma-db similarity search; currently a stub
        that returns None.
        """
        pass

    def answer_question(self, query, table):
        """Run *query* against *table* through the TAPAS pipeline.

        Uses the keyword form documented for the table-question-answering
        pipeline (``table=``, ``query=``) instead of positional arguments.
        Returns the pipeline's answer dict.
        """
        answer = self.pipe(table=table, query=query)
        return answer
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.26.0
|
2 |
+
pandas==1.3.3
|
3 |
+
transformers==4.10.2
|
4 |
+
gradio==2.3.0
|
5 |
+
sentence-transformers==2.0.0
|
run.sh
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Entry point for the TAPAS LLM Application
# Forward any extra command-line arguments to streamlit, as the README
# states this script handles them (previously they were dropped).
streamlit run app.py "$@"
|
setup_instructions.md
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Setup Instructions for the TAPAS LLM Application
|
2 |
+
|
3 |
+
1. Install the necessary dependencies:
|
4 |
+
|
5 |
+
```bash
|
6 |
+
pip install -r requirements.txt
|
7 |
+
```
|
8 |
+
|
9 |
+
2. Setup the required services (e.g., chroma db):
|
10 |
+
|
11 |
+
```bash
|
12 |
+
# Add the appropriate commands here
|
13 |
+
```
|
14 |
+
|
15 |
+
3. Run the application:
|
16 |
+
|
17 |
+
```bash
|
18 |
+
./run.sh
|
19 |
+
```
|
20 |
+
|
21 |
+
If you encounter any issues during setup, please refer to the [Troubleshooting Guide](troubleshooting.md).
|
tapas_application_specification.txt
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Specification for TAPAS LLM Application
|
2 |
+
|
3 |
+
1. Overview:
|
4 |
+
The application will be a web-based application hosted on HuggingFace "Space" using either streamlit or gradio UI. It will allow users to upload CSV files and ask questions about the data in the files. The application will use Google's TAPAS LLM model to answer these questions.
|
5 |
+
|
6 |
+
2. Core Classes and Functions:
|
7 |
+
- `CSVLoader`: This class will handle the uploading and loading of CSV files. It will convert the CSV files into pandas dataframes with the required structure for the SentenceTransformer retriever.
|
8 |
+
- `EmbeddingGenerator`: This class will generate embeddings for the data using the SentenceTransformer model "deepset/all-mpnet-base-v2-table".
|
9 |
+
- `ChromaDBIndexer`: This class will create a chroma db index and upload the generated embeddings to the index.
|
10 |
+
- `QuestionAnswerer`: This class will handle the processing of user's queries, querying the chroma db for the relevant table, and running the table and query through the TAPAS pipeline for answering.
|
11 |
+
- `AnswerPresenter`: This class will handle the presentation of the answers returned from TAPAS in the UI. It will also handle the execution of any suggested aggregator functions.
|
12 |
+
|
13 |
+
3. Non-Standard Dependencies:
|
14 |
+
- pandas: For handling CSV files and data manipulation.
|
15 |
+
- HuggingFace Transformers: For the TAPAS model, tokenizer, pipeline, and SentenceTransformer model.
|
16 |
+
- streamlit or gradio: For the UI of the application.
|
17 |
+
- chroma db: For creating the index and querying for relevant tables.
|
18 |
+
|
19 |
+
4. Detailed Functionality:
|
20 |
+
- The application will start with a UI where users can upload one or more CSV files. The `CSVLoader` will handle the uploading and loading of these files, converting them into pandas dataframes with the required structure.
|
21 |
+
- The `EmbeddingGenerator` will then generate embeddings for the data using the SentenceTransformer model "deepset/all-mpnet-base-v2-table".
|
22 |
+
- The `ChromaDBIndexer` will create a chroma db index with a metric type of "cosine" and a dimension of "768", and upload the generated embeddings to the index.
|
23 |
+
- The user will be able to input a query in natural language. The `QuestionAnswerer` will take this query, generate embeddings for it, and query the chroma db for the relevant table that should contain the answer.
|
24 |
+
- The `QuestionAnswerer` will then run the relevant table and the query through the TAPAS pipeline for answering. The TAPAS model, tokenizer, and pipeline will be loaded from HuggingFace.
|
25 |
+
- The `AnswerPresenter` will take the answer returned from TAPAS and present it neatly in the UI. If the TAPAS answer includes an aggregator, the `AnswerPresenter` will first execute the aggregator function on the cell values before presenting the answer.
|
26 |
+
|
27 |
+
5. Additional Considerations:
|
28 |
+
- Error handling should be implemented to handle cases where the CSV file cannot be loaded, the embeddings cannot be generated, the chroma db index cannot be created, or the TAPAS pipeline cannot answer the question.
|
29 |
+
- The UI should be user-friendly and intuitive, with clear instructions on how to upload CSV files and input queries.
|
30 |
+
- The application should be designed with scalability in mind, as the number of CSV files and queries could potentially be large.
|
test_answer_presenter.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from answer_presenter import AnswerPresenter


def test_answer_presenter():
    """Exercise answer presentation with and without an aggregator."""
    answer_presenter = AnswerPresenter()

    # Test presenting answer without aggregator: the raw text is shown.
    answer = {"answer": "Test", "aggregator": "NONE"}
    # Fixed: the class defines present_answer, not ``present``.
    presented_answer = answer_presenter.present_answer(answer)
    assert presented_answer == "Test"

    # Test presenting answer with aggregator: cells are reduced first.
    answer = {"answer": "SUM > 1, 2", "cells": ["1", "2"], "aggregator": "SUM"}
    presented_answer = answer_presenter.present_answer(answer)
    assert presented_answer == "3"
|
test_application.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# NOTE(review): no application.py is included in this upload — confirm
# the ``application`` module and its Application class actually exist.
from application import Application


def test_application():
    # End-to-end smoke test of the application facade.
    app = Application()

    # Test uploading CSV file
    # NOTE(review): assumes Application exposes ``upload_csv`` and a
    # ``csv_loader.loaded_files`` attribute; neither appears in the
    # uploaded files — verify against the real Application class.
    app.upload_csv("test.csv")
    assert app.csv_loader.loaded_files == ["test.csv"]

    # Test asking question
    # NOTE(review): assumes ask_question returns a plain string answer.
    answer = app.ask_question("What is the sum?")
    assert isinstance(answer, str)
|
test_chroma_db_indexer.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from chroma_db_indexer import ChromaDBIndexer


def test_chroma_db_indexer():
    # Unit test for the chroma index wrapper.
    chroma_db_indexer = ChromaDBIndexer()

    # Test creating index
    # NOTE(review): chroma_db_indexer.py defines create_index(embeddings)
    # with a required argument and no return value — this zero-argument
    # call and the index attribute assertions do not match that API;
    # confirm the intended interface (spec: cosine metric, dim 768).
    index = chroma_db_indexer.create_index()
    assert index.metric_type == "cosine"
    assert index.dim == 768

    # Test uploading embeddings
    # NOTE(review): upload_embeddings and index.search are not defined
    # anywhere in the uploaded files — verify before relying on this.
    embeddings = [0]*768
    chroma_db_indexer.upload_embeddings(embeddings, index)
    assert len(index.search(embeddings, 1)) == 1
|
test_csv_loader.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
from csv_loader import CSVLoader


def test_csv_loader():
    # Unit test for the CSV loading helpers.
    csv_loader = CSVLoader()

    # Test loading a CSV file
    # NOTE(review): csv_loader.py defines only load_csvs(files); ``load``
    # is not implemented, and "test.csv" must exist on disk — confirm
    # the intended API and add a fixture file.
    df = csv_loader.load("test.csv")
    assert isinstance(df, pd.DataFrame)

    # Test converting to required structure
    # NOTE(review): convert_to_required_structure is also not defined on
    # CSVLoader as uploaded.
    structured_df = csv_loader.convert_to_required_structure(df)
    assert isinstance(structured_df, pd.DataFrame)
    assert "\n" in structured_df.to_string(index=False)
|
test_embedding_generator.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from embedding_generator import EmbeddingGenerator


def test_embedding_generator():
    # Unit test for embedding generation.
    # NOTE(review): constructing EmbeddingGenerator downloads the
    # HuggingFace model, so this test needs network access.
    embedding_generator = EmbeddingGenerator()

    # Test generating embeddings
    # NOTE(review): embedding_generator.py defines
    # generate_embeddings(dataframes), not ``generate``, and it returns a
    # list with one array per dataframe — not a flat 768-float list.
    # Confirm the intended API before relying on these assertions.
    embeddings = embedding_generator.generate("test query")
    assert isinstance(embeddings, list)
    assert len(embeddings) == 768
|
test_question_answerer.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Fixed: ``pd`` was used below without being imported (NameError).
import pandas as pd

from question_answerer import QuestionAnswerer


def test_question_answerer():
    """Exercise query processing, retrieval and the TAPAS pipeline."""
    # NOTE(review): constructing QuestionAnswerer downloads the TAPAS
    # model, so this test needs network access.
    question_answerer = QuestionAnswerer()

    # Test processing query
    # NOTE(review): process_query is not defined on QuestionAnswerer
    # (only query_table and answer_question are) — confirm intended API.
    processed_query = question_answerer.process_query("test query")
    assert isinstance(processed_query, list)

    # Test querying chroma db
    # NOTE(review): query_chroma_db is likewise undefined as uploaded.
    table = question_answerer.query_chroma_db(processed_query)
    assert isinstance(table, pd.DataFrame)

    # Test running TAPAS pipeline
    # NOTE(review): run_tapas_pipeline is likewise undefined as uploaded.
    answer = question_answerer.run_tapas_pipeline(processed_query, table)
    assert isinstance(answer, dict)
    assert "answer" in answer
|