lgfunderburk commited on
Commit
df56dc3
1 Parent(s): b8bd17b
Files changed (10) hide show
  1. .gitignore +161 -0
  2. Dockerfile +43 -0
  3. README.md +40 -6
  4. __init__.py +0 -0
  5. app.py +31 -0
  6. chainlit.md +25 -0
  7. faissdenseretrieval.py +90 -0
  8. poetry.lock +0 -0
  9. pyproject.toml +26 -0
  10. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the official Python image as the base image
FROM python:3.9

# Create a non-root user and switch to it (required for HF Spaces)
RUN useradd -m -u 1000 user
USER user

# Set environment variables for the new user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory for the user
WORKDIR $HOME/app

# Upgrade pip and install build/runtime tooling
RUN pip install --upgrade pip && \
    pip install farm-haystack[faiss] poetry

# Create the virtual environment inside the project directory
ENV POETRY_VIRTUALENVS_IN_PROJECT=true

# Copy only the dependency manifests first so Docker layer caching skips
# re-installing dependencies when only application code changes.
# NOTE: the previous COPY targets used `~` (not expanded by Docker — it
# created a literal "~" directory) and /app (outside WORKDIR, root-owned);
# both are fixed to the user-owned $HOME/app.
COPY --chown=user pyproject.toml poetry.lock $HOME/app/

# Lock and install project dependencies using poetry
# NOTE(review): `poetry lock` regenerates the lockfile at build time, which
# weakens reproducibility; kept to match the original build, but consider
# installing strictly from the committed poetry.lock instead.
RUN poetry lock
RUN poetry install

# Copy the application code (after deps, to keep the cache effective)
COPY --chown=user . $HOME/app

# Define the command to run the app
CMD ["poetry", "run", "chainlit", "run", "app.py", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,45 @@
1
  ---
2
- title: Barbie Raq Hf
3
- emoji: 🐠
4
- colorFrom: pink
5
- colorTo: pink
6
  sdk: docker
 
7
  pinned: false
8
- license: openrail
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Haystack-rag-about-Barbie
3
+ emoji: 🚀
4
+ colorFrom: indigo
5
+ colorTo: red
6
  sdk: docker
7
+ app_file: app.py
8
  pinned: false
 
9
  ---
10
 
11
+
12
+ # What do people think about the Barbie (2023) movie?
13
+
14
+ This chatbot can help you identify what people think about the Barbie (2023) movie. You can also ask it information about the movie.
15
+
16
+ ### Mini demo
17
+
18
+ ![](demo.gif)
19
+
20
+ ### App
21
+
22
+ - Code to do web scraping from natural language query is in [faissdenseretrieval.py](faissdenseretrieval.py)
23
+ - Code to run the app is in [app.py](app.py)
24
+
25
+ ### How it is built:
26
+
27
+ The application uses Haystack's WebRetriever class to scrape reviews from the internet. It uses a simple natural language query — "IMDB movie reviews for the Barbie movie (2023)" — and fetches the top 100 results. The results are then stored in a FAISS document store.
28
+
29
+ To retrieve answers I used the DensePassageRetriever class from Haystack using the following models:
30
+
31
+ ```
32
+ query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
33
+ passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
34
+ ```
35
+
36
+ The embeddings were applied onto the documents in the document store.
37
+
38
+ I then initialized a Haystack pipeline whose nodes include a prompt node that uses OpenAI's GPT-4 and the DensePassageRetriever node. Its user interface was built using Chainlit.
39
+
40
+ ### How does it work?
41
+
42
+ 1. The WebRetriever will scrape the internet for reviews of the Barbie movie (2023) based on the natural language query using the SERP API.
43
+ 2. The WebRetriever transforms the results into Document objects which can then be saved into a FAISS document store.
44
+ 3. The `DensePassageRetriever` node will apply embeddings to the documents in the document store and then it will use the embeddings to retrieve the top k results for a given query.
45
+ 4. When a user asks a question, the PromptNode will use the top k results to generate an answer using OpenAI's GPT-4.
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import chainlit as cl
from dotenv import load_dotenv

from faissdenseretrieval import (
    initialize_documents,
    initialize_faiss_document_store,
    initialize_rag_pipeline,
)

# Load environment variables: first from a parent-directory .env (useful when
# running from a subfolder), then from a local .env; already-set values win.
load_dotenv("../.env")
load_dotenv()
serp = os.getenv("SERP_API_KEY")
openai_key = os.getenv("OPENAI_API_KEY")

# Scrape reviews and build the index once at import time, so the document
# store and pipeline are constructed a single time rather than per message.
documents = initialize_documents(
    serp_key=serp,
    nl_query="IMDB movie reviews for the Barbie movie (2023)",
)

# Initialize document store and retriever
document_store, retriever = initialize_faiss_document_store(documents=documents)

# Initialize the RAG query pipeline
query_pipeline = initialize_rag_pipeline(retriever=retriever, openai_key=openai_key)


@cl.on_message
async def main(message: str):
    """Answer a user question by running the RAG pipeline over the reviews."""
    # Use the pipeline to get a response
    output = query_pipeline.run(query=message)

    # Guard against an empty answer list so a retrieval/generation miss
    # does not crash the handler with an IndexError.
    answers = output.get('answers') or []
    if answers:
        response = answers[0].answer
    else:
        response = "Sorry, I could not generate an answer for that question."

    # Send the message to the user
    await cl.Message(content=response).send()
chainlit.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # What do people think about the Barbie (2023) movie?
2
+
3
+ This chatbot can help you identify what people think about the Barbie (2023) movie. You can also ask it information about the movie.
4
+
5
+ ### How it is built:
6
+
7
+ The application uses Haystack's WebRetriever class to scrape reviews from the internet. It uses a simple natural language query — "IMDB movie reviews for the Barbie movie (2023)" — and fetches the top 100 results. The results are then stored in a FAISS document store.
8
+
9
+ To retrieve answers I used the DensePassageRetriever class from Haystack using the following models:
10
+
11
+
12
+ query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
13
+ passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
14
+
15
+
16
+ The embeddings were applied onto the documents in the document store.
17
+
18
+ I then initialized a Haystack pipeline whose nodes include a prompt node that uses OpenAI's GPT-4 and the DensePassageRetriever node. Its user interface was built using Chainlit.
19
+
20
+ ### How does it work?
21
+
22
+ 1. The WebRetriever will scrape the internet for reviews of the Barbie movie (2023) based on the natural language query using the SERP API.
23
+ 2. The WebRetriever transforms the results into Document objects which can then be saved into a FAISS document store.
24
+ 3. The DensePassageRetriever node will apply embeddings to the documents in the document store and then it will use the embeddings to retrieve the top k results for a given query.
25
+ 4. When a user asks a question, the PromptNode will use the top k results to generate an answer using OpenAI's GPT-4.
faissdenseretrieval.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.nodes import WebRetriever
2
+ from haystack.schema import Document
3
+ from typing import List
4
+ from haystack.document_stores import FAISSDocumentStore
5
+ from haystack.nodes import AnswerParser, PromptNode, PromptTemplate
6
+ from haystack import Pipeline
7
+ from haystack.nodes import DensePassageRetriever
8
+ import os
9
+ from dotenv import load_dotenv
10
+
11
def initialize_documents(serp_key, nl_query, top_k=100):
    """
    Retrieve preprocessed documents from the web via the SERP API.

    Args:
        serp_key (str): API key for the SERP API.
        nl_query (str): Natural language query to retrieve documents for.
        top_k (int): Maximum number of search results to fetch.
            Defaults to 100 (the previously hard-coded value).

    Returns:
        List[Document]: Documents built from the scraped search results.
    """
    # "preprocessed_documents" mode returns cleaned, ready-to-index
    # Document objects rather than raw page content.
    retriever = WebRetriever(api_key=serp_key,
                             mode="preprocessed_documents",
                             top_k=top_k)

    # Retrieve documents based on the natural language query
    documents: List[Document] = retriever.retrieve(query=nl_query)

    return documents
29
+
30
def initialize_faiss_document_store(documents):
    """
    Build a FAISS document store, index the given documents, and embed them.

    Args:
        documents (List[Document]): Documents to write into the store.

    Returns:
        store (FAISSDocumentStore): FAISS store holding the embedded documents.
        dpr (DensePassageRetriever): Retriever bound to the store.
    """
    # "Flat" index factory => exact (brute-force) similarity search
    store = FAISSDocumentStore(
        faiss_index_factory_str="Flat",
        return_embedding=True,
    )

    # DPR uses separate encoders for queries and for passages
    dpr = DensePassageRetriever(
        document_store=store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
        embed_title=True,
    )

    # Start from a clean store, then write the freshly scraped documents
    store.delete_documents()
    store.write_documents(documents)

    # Compute and attach embeddings for every stored document
    store.update_embeddings(retriever=dpr)

    return store, dpr
61
+
62
def initialize_rag_pipeline(retriever, openai_key):
    """
    Initialize a pipeline for RAG-based question answering.

    Args:
        retriever (DensePassageRetriever): Dense passage retriever.
        openai_key (str): API key for OpenAI.

    Returns:
        query_pipeline (Pipeline): Pipeline for RAG-based question answering.
    """
    # Fix: the original used an accidental quadruple quote ('""""Answer...'),
    # which prepended a stray literal '"' to every prompt sent to the model.
    prompt_template = PromptTemplate(prompt="""Answer the following query based on the provided context. If the context does
not include an answer, reply with 'The data does not contain information related to the question'.\n
Query: {query}\n
Documents: {join(documents)}
Answer:
""",
                                     output_parser=AnswerParser())

    # GPT-4 prompt node; stream=True emits tokens as they are generated
    prompt_node = PromptNode(model_name_or_path="gpt-4",
                             api_key=openai_key,
                             default_prompt_template=prompt_template,
                             max_length=500,
                             model_kwargs={"stream": True})

    # Pipeline topology: Query -> Retriever -> PromptNode
    query_pipeline = Pipeline()
    query_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
    query_pipeline.add_node(component=prompt_node, name="PromptNode", inputs=["Retriever"])

    return query_pipeline
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "llmops-with-haystack"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Laura Gutierrez Funderburk <lgutierrwr@gmail.com>"]
6
+ license = "Apache 2.0"
7
+ readme = "README.md"
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.9"
11
+ torch = [
12
+ {url = "https://download.pytorch.org/whl/cpu/torch-1.10.0%2Bcpu-cp39-cp39-linux_x86_64.whl", markers = "sys_platform == 'linux'"},
13
+ ]
14
+ farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
15
+ chainlit = "^0.7.0"
16
+ openai = "^0.28.0"
17
+ jupyter = "^1.0.0"
18
+ ipykernel = "^6.25.2"
19
+ python-dotenv = "^1.0.0"
20
+ datasets = "^2.14.5"
21
+ nltk = "^3.8.1"
22
+
23
+
24
+ [build-system]
25
+ requires = ["poetry-core"]
26
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
The diff for this file is too large to render. See raw diff