Spaces:
Running
Running
jeffrey
committed on
Commit
•
37c1830
1
Parent(s):
95681ec
init commit
Browse files- .gitignore +166 -0
- app.py +143 -0
- config/init_project_for_pseudo_trial.yaml +11 -0
- config/init_project_for_run.yaml +38 -0
- packages.txt +4 -0
- requirements.txt +1 -0
- src/__init__.py +0 -0
- src/data/__init__.py +0 -0
- src/data/chunk.py +7 -0
- src/data/parse.py +16 -0
- src/runner.py +96 -0
.gitignore
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
110 |
+
.pdm.toml
|
111 |
+
.pdm-python
|
112 |
+
.pdm-build/
|
113 |
+
|
114 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
115 |
+
__pypackages__/
|
116 |
+
|
117 |
+
# Celery stuff
|
118 |
+
celerybeat-schedule
|
119 |
+
celerybeat.pid
|
120 |
+
|
121 |
+
# SageMath parsed files
|
122 |
+
*.sage.py
|
123 |
+
|
124 |
+
# Environments
|
125 |
+
.env
|
126 |
+
.venv
|
127 |
+
env/
|
128 |
+
venv/
|
129 |
+
ENV/
|
130 |
+
env.bak/
|
131 |
+
venv.bak/
|
132 |
+
|
133 |
+
# Spyder project settings
|
134 |
+
.spyderproject
|
135 |
+
.spyproject
|
136 |
+
|
137 |
+
# Rope project settings
|
138 |
+
.ropeproject
|
139 |
+
|
140 |
+
# mkdocs documentation
|
141 |
+
/site
|
142 |
+
|
143 |
+
# mypy
|
144 |
+
.mypy_cache/
|
145 |
+
.dmypy.json
|
146 |
+
dmypy.json
|
147 |
+
|
148 |
+
# Pyre type checker
|
149 |
+
.pyre/
|
150 |
+
|
151 |
+
# pytype static type analyzer
|
152 |
+
.pytype/
|
153 |
+
|
154 |
+
# Cython debug symbols
|
155 |
+
cython_debug/
|
156 |
+
|
157 |
+
# PyCharm
|
158 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
159 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
160 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
161 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
162 |
+
.idea/
|
163 |
+
file_cache/
|
164 |
+
data/
|
165 |
+
init_project_dir/
|
166 |
+
!src/data
|
app.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import os.path
|
3 |
+
import tempfile
|
4 |
+
import uuid
|
5 |
+
from typing import List
|
6 |
+
|
7 |
+
import gradio
|
8 |
+
import gradio as gr
|
9 |
+
import openai
|
10 |
+
import pandas as pd
|
11 |
+
from autorag.evaluator import Evaluator
|
12 |
+
|
13 |
+
from src.data.chunk import chunk
|
14 |
+
from src.data.parse import parse_pdf
|
15 |
+
from src.runner import GradioStreamRunner
|
16 |
+
|
17 |
+
root_dir = os.path.dirname(os.path.realpath(__file__))
|
18 |
+
|
19 |
+
pseudo_trial_yaml_path = os.path.join(root_dir, "config", "init_project_for_pseudo_trial.yaml")
|
20 |
+
init_run_yaml = os.path.join(root_dir, "config", "init_project_for_run.yaml")
|
21 |
+
|
22 |
+
gradio_runner = None
|
23 |
+
|
24 |
+
# Code for Task 1
def file_ingest(input_files: List[str], temp_project_dir, progress=gr.Progress()):
    """Parse, chunk, and index the uploaded files into an AutoRAG project.

    Runs as a generator so Gradio can stream status messages to the UI.

    Args:
        input_files: File paths handed over by the ``gr.File`` component.
        temp_project_dir: AutoRAG project directory the pseudo trial writes to.
        progress: Gradio progress tracker (injected by Gradio).

    Yields:
        Human-readable status strings shown in the status textbox.
    """
    # NOTE: this function is a generator (it yields below), so a bare
    # `return "msg"` would end the stream WITHOUT displaying the message.
    # Early-exit messages must be yielded, then followed by a plain return.
    if os.getenv("OPENAI_API_KEY") is None:
        yield "Please submit your OpenAI API key first."
        return
    if not input_files:
        yield "Please upload a file first."
        return
    progress(0.05)
    # do parse
    raw_df = parse_pdf(file_lists=input_files)
    progress(0.3)
    # do chunk
    corpus_df = chunk(raw_df, method="recursivecharacter",
                      lang="en", chunk_size=512, chunk_overlap=128)
    progress(0.5)
    # (fix) removed `asyncio.sleep(0.5)`: called without `await` it only
    # created a never-awaited coroutine and did not pause at all.

    # Logic for button click
    empty_qa_df = make_empty_qa(corpus_df=corpus_df)
    with tempfile.TemporaryDirectory() as temp_data_dir:
        empty_qa_df.to_parquet(os.path.join(temp_data_dir, "empty_qa.parquet"))
        corpus_df.to_parquet(os.path.join(temp_data_dir, "corpus.parquet"))

        evaluator = Evaluator(qa_data_path=os.path.join(temp_data_dir, "empty_qa.parquet"),
                              corpus_data_path=os.path.join(temp_data_dir, "corpus.parquet"),
                              project_dir=temp_project_dir)
        # The pseudo trial only ingests/indexes the corpus; validation is skipped.
        evaluator.start_trial(pseudo_trial_yaml_path, skip_validation=True)
    yield "Setting up"
    progress(0.9)
    set_runner(temp_project_dir)
    progress(1.0)
    yield "File uploaded complete. You can use it at chatbot now."
|
55 |
+
|
56 |
+
|
57 |
+
def make_empty_qa(corpus_df: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row placeholder QA dataset pointing at the first corpus chunk.

    AutoRAG requires a QA dataset to run a trial; this dummy row lets the
    pseudo ingestion trial execute against the freshly-built corpus.

    Args:
        corpus_df: Chunked corpus; must contain a ``doc_id`` column.

    Returns:
        A single-row DataFrame with ``qid``, ``query``, ``retrieval_gt`` and
        ``generation_gt`` columns.

    Raises:
        ValueError: If the corpus is empty (there is no doc_id to point at).
    """
    if corpus_df.empty:
        raise ValueError("corpus_df is empty; cannot build a placeholder QA row.")
    doc_id = corpus_df["doc_id"].iloc[0]
    return pd.DataFrame({
        # Wrapped in a list for consistency with the other columns
        # (previously a bare scalar relying on pandas broadcasting).
        "qid": [str(uuid.uuid4())],
        "query": ["Who is Kai Havertz?"],
        "retrieval_gt": [[[doc_id]]],
        "generation_gt": [["Havertz is the greatest footballer."]],
    })
|
65 |
+
|
66 |
+
|
67 |
+
def on_submit_openai_key(openai_key):
    """Store the submitted OpenAI key and verify it with a tiny test call.

    Args:
        openai_key: API key text from the password textbox.

    Returns:
        Status string for the ``api_key_status_box`` textbox.
    """
    os.environ["OPENAI_API_KEY"] = openai_key
    # Test openai key
    try:
        client = openai.OpenAI()
        response = client.chat.completions.create(
            messages=[
                {"role": "user", "content": "What is the capital of France?"},
            ],
            model="gpt-4o-mini",
            max_tokens=3,
        )
        assert isinstance(response.choices[0].message.content, str)
        gr.Info("OpenAI API key submitted.", duration=3)
        return "Setting complete."
    except openai.AuthenticationError:
        # (fix) gr.Error is an exception type and only displays when *raised*;
        # calling it like a function showed nothing. gr.Warning IS displayed
        # when called, so the "Not Set" status string can still be returned.
        gr.Warning("OpenAI API key is invalid.", duration=3)
        return "Not Set"
    except AssertionError:
        gr.Warning("OpenAI server is not working properly.", duration=3)
        return "Not Set"
88 |
+
|
89 |
+
|
90 |
+
def set_runner(project_dir):
    """Build the Gradio streaming runner for *project_dir* and install it
    into the module-level ``gradio_runner`` used by the chatbot."""
    global gradio_runner
    gradio_runner = GradioStreamRunner.from_yaml(
        yaml_path=init_run_yaml, project_dir=project_dir
    )
94 |
+
|
95 |
+
|
96 |
+
def get_response(message, history):
    """Stream a chatbot answer for *message* from the global AutoRAG runner.

    Yields partial answer strings; warns (and yields nothing) when the runner
    or the OpenAI key has not been set up yet.
    """
    global gradio_runner
    if gradio_runner is None:
        gradio.Warning("Please set the AutoRAG server first.")
        return
    if os.getenv("OPENAI_API_KEY", None) is None:
        gradio.Warning("Please submit your OpenAI API key first.")
        return

    # stream_run yields (delta, retrieved_passages); only the text is shown.
    for delta, _passages in gradio_runner.stream_run(message):
        yield delta
107 |
+
|
108 |
+
|
109 |
+
# interface one
# Top-level Gradio UI: left column handles API key + file ingestion,
# right column hosts the streaming chatbot backed by get_response.
with gr.Blocks(theme="earneleh/paris") as demo:
    # NOTE(review): this TemporaryDirectory is entered at UI-build time and
    # cleaned up when the `with` block exits, i.e. before demo.launch() —
    # file_ingest later writes into this path; presumably AutoRAG recreates
    # the directory, but confirm the project dir survives until ingestion.
    with tempfile.TemporaryDirectory() as project_dir:
        # Define components
        with gr.Row():
            with gr.Column(scale=3):
                textbox = gr.Textbox(label="Please input your OpenAI API key and press Enter.", type="password",
                                     info="You can get your API key from https://platform.openai.com/account/api-keys\n"
                                          "AutoRAG do not store your API key.",
                                     autofocus=True)
                api_key_status_box = gr.Textbox(label="OpenAI API status", value="Not Set", interactive=False)

                gr.Markdown("## Ingest Your Data")

                file_input = gr.File(label="Upload Files", type="filepath", file_count="multiple")
                button = gr.Button("Submit file")
                text_output = gr.Textbox(label="Status update", interactive=False)

                # Define layout and interactions
                textbox.submit(on_submit_openai_key, inputs=[textbox], outputs=api_key_status_box)
                # gr.State snapshots project_dir so the click handler gets the path.
                button.click(file_ingest, inputs=[file_input, gr.State(project_dir)], outputs=[text_output])

            with gr.Column(scale=7):
                gr.ChatInterface(
                    get_response, title="This is your Naive RAG Chatbot 🚀", retry_btn=None, undo_btn=None,
                )

        gr.Markdown("## Do you like the result?\n\nIf you don't like it, try to optimize it with AutoRAG. Press below button and go to make evaluation data and optimize it. Both on the Huggingface space so you don't need to install anything.")
        with gr.Row():
            open_data_creation = gr.Button(value="1️⃣ : Data Creation",
                                           link="https://huggingface.co/spaces/AutoRAG/AutoRAG-data-creation")
            open_optimize = gr.Button(value="2️⃣ : Optimize", link="https://www.auto-rag.com/")


demo.launch(share=False, debug=True)
|
config/init_project_for_pseudo_trial.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
node_lines:
|
2 |
+
- node_line_name: retrieve_node_line
|
3 |
+
nodes:
|
4 |
+
- node_type: retrieval # Set Retrieval Node
|
5 |
+
strategy:
|
6 |
+
metrics: [retrieval_f1, retrieval_recall] # Set Retrieval Metrics
|
7 |
+
top_k: 3
|
8 |
+
modules:
|
9 |
+
- module_type: vectordb
|
10 |
+
embedding_model: openai
|
11 |
+
- module_type: bm25
|
config/init_project_for_run.yaml
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
node_lines:
|
2 |
+
- node_line_name: retrieve_node_line
|
3 |
+
nodes:
|
4 |
+
- modules:
|
5 |
+
- module_type: vectordb
|
6 |
+
embedding_model: openai
|
7 |
+
top_k: 5
|
8 |
+
node_type: retrieval
|
9 |
+
strategy:
|
10 |
+
metrics:
|
11 |
+
- retrieval_f1
|
12 |
+
- retrieval_recall
|
13 |
+
- retrieval_precision
|
14 |
+
- node_line_name: post_retrieve_node_line
|
15 |
+
nodes:
|
16 |
+
- modules:
|
17 |
+
- module_type: fstring
|
18 |
+
prompt: "You are the helpful assistant to answer the question. I will give you a context to read. The context can be unrelated to the question.
|
19 |
+
If the context is related, you must answer the question base on the context.
|
20 |
+
If there is no context that relates to the question, you must say that you don't know about the answer.
|
21 |
+
DO NOT MAKE UP THE ANSWER.
|
22 |
+
If you can solve the question with your own knowledge, you can answer the question. But please do not lie or make up the answer without relevant information.
|
23 |
+
Question: {query} \n Context: {retrieved_contents} \n Answer : "
|
24 |
+
node_type: prompt_maker
|
25 |
+
strategy:
|
26 |
+
metrics:
|
27 |
+
- bleu
|
28 |
+
- meteor
|
29 |
+
- rouge
|
30 |
+
- modules:
|
31 |
+
- llm: openai
|
32 |
+
model: gpt-4o-mini
|
33 |
+
module_type: llama_index_llm
|
34 |
+
temperature: 1.0
|
35 |
+
node_type: generator
|
36 |
+
strategy:
|
37 |
+
metrics:
|
38 |
+
- rouge
|
packages.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gcc
|
2 |
+
poppler-utils
|
3 |
+
tesseract-ocr
|
4 |
+
libssl-dev
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
AutoRAG[parse,ko,ja]>=0.3.5
|
src/__init__.py
ADDED
File without changes
|
src/data/__init__.py
ADDED
File without changes
|
src/data/chunk.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from autorag.data.chunk import langchain_chunk
|
3 |
+
|
4 |
+
|
5 |
+
def chunk(raw_df: pd.DataFrame, method: str, lang: str = "en", **kwargs) -> pd.DataFrame:
    """Chunk a parsed dataframe into an AutoRAG corpus dataframe.

    Args:
        raw_df: Output of the parse step (``texts``/``path``/... columns).
        method: LangChain chunking method name, e.g. ``"recursivecharacter"``.
        lang: Language code forwarded to ``langchain_chunk``.
        **kwargs: Extra splitter options (e.g. ``chunk_size``, ``chunk_overlap``).

    Returns:
        Chunked corpus dataframe.

    NOTE(review): ``lang`` is forwarded as ``add_file_name=lang`` — presumably
    ``add_file_name`` accepts a language string controlling file-name
    prefixing; confirm against the ``langchain_chunk`` signature.
    """
    corpus_df = langchain_chunk(raw_df, chunk_method=method, add_file_name=lang, **kwargs)
    return corpus_df
|
src/data/parse.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable, List
|
2 |
+
|
3 |
+
from autorag.data.parse import langchain_parse
|
4 |
+
from autorag.data.parse.base import _add_last_modified_datetime
|
5 |
+
from autorag.utils import result_to_dataframe
|
6 |
+
|
7 |
+
|
8 |
+
@result_to_dataframe(["texts", "path", "page", "last_modified_datetime"])
def original_parse(fn: Callable, **kwargs):
    """Invoke a raw (undecorated) parse callable and stamp the result with
    last-modified datetimes, packaged as the standard parse dataframe."""
    parsed = fn(**kwargs)
    return _add_last_modified_datetime(parsed)
|
13 |
+
|
14 |
+
def parse_pdf(file_lists: List[str], parse_method: str = "pdfminer"):
    """Parse PDF files into a raw dataframe.

    Uses ``langchain_parse.__wrapped__`` to bypass its own decorator, then
    routes it through ``original_parse`` so the standard columns (including
    ``last_modified_datetime``) are still attached.
    """
    return original_parse(
        langchain_parse.__wrapped__,
        data_path_list=file_lists,
        parse_method=parse_method,
    )
|
src/runner.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import uuid
|
3 |
+
from typing import List, Dict, Optional
|
4 |
+
|
5 |
+
import pandas as pd
|
6 |
+
from autorag.deploy import GradioRunner
|
7 |
+
from autorag.deploy.api import RetrievedPassage
|
8 |
+
from autorag.nodes.generator.base import BaseGenerator
|
9 |
+
from autorag.utils import fetch_contents
|
10 |
+
|
11 |
+
# Placeholder passage yielded while streaming generator output, before any
# real retrieval results are attached (see GradioStreamRunner.stream_run).
empty_retrieved_passage = RetrievedPassage(
    content="", doc_id="", filepath=None, file_page=None, start_idx=None, end_idx=None
)
|
14 |
+
|
15 |
+
class GradioStreamRunner(GradioRunner):
    """GradioRunner variant that streams the generator node's output.

    Non-generator modules run eagerly as in the base runner; the final
    generator module is streamed token-by-token via ``stream_run``.
    """

    def __init__(self, config: Dict, project_dir: Optional[str] = None):
        """Load the pipeline config and the project's corpus parquet.

        NOTE(review): ``project_dir`` defaults to None but is passed straight
        to ``os.path.join`` below, which would raise a TypeError — callers
        appear to always supply a real directory; confirm.
        """
        super().__init__(config, project_dir)

        # The corpus is needed to resolve retrieved doc_ids back into passages.
        data_dir = os.path.join(project_dir, "data")
        self.corpus_df = pd.read_parquet(
            os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
        )

    def stream_run(self, query: str):
        """Run the pipeline on *query*, yielding ``(delta, passages)`` tuples.

        Each yielded ``delta`` is a partial generation string; ``passages`` is
        currently always ``[empty_retrieved_passage]`` (real passage
        extraction is commented out below).
        """
        previous_result = pd.DataFrame(
            {
                "qid": str(uuid.uuid4()),
                "query": [query],
                "retrieval_gt": [[]],
                "generation_gt": [""],
            }
        )  # pseudo qa data for execution

        for module_instance, module_param in zip(
            self.module_instances, self.module_params
        ):
            if not isinstance(module_instance, BaseGenerator):
                # Non-generator nodes run eagerly; their output columns
                # replace same-named columns from the previous step.
                new_result = module_instance.pure(
                    previous_result=previous_result, **module_param
                )
                duplicated_columns = previous_result.columns.intersection(
                    new_result.columns
                )
                drop_previous_result = previous_result.drop(
                    columns=duplicated_columns
                )
                previous_result = pd.concat(
                    [drop_previous_result, new_result], axis=1
                )
            else:
                # NOTE(review): passage extraction is disabled; re-enable to
                # surface the retrieved sources alongside the answer.
                # retrieved_passages = self.extract_retrieve_passage(
                #     previous_result
                # )
                # yield "", retrieved_passages
                # Start streaming of the result
                assert len(previous_result) == 1
                prompt: str = previous_result["prompts"].tolist()[0]
                for delta in module_instance.stream(prompt=prompt, **module_param):
                    yield delta, [empty_retrieved_passage]


    def extract_retrieve_passage(self, df: pd.DataFrame) -> List[RetrievedPassage]:
        """Convert the retrieval result row into RetrievedPassage objects.

        Looks up contents, file paths, metadata, and start/end indices for the
        first row's ``retrieved_ids``; ``path``/``start_end_idx`` are optional
        corpus columns and fall back to ``None`` per passage.
        """
        retrieved_ids: List[str] = df["retrieved_ids"].tolist()[0]
        contents = fetch_contents(self.corpus_df, [retrieved_ids])[0]
        if "path" in self.corpus_df.columns:
            paths = fetch_contents(self.corpus_df, [retrieved_ids], column_name="path")[
                0
            ]
        else:
            paths = [None] * len(retrieved_ids)
        metadatas = fetch_contents(
            self.corpus_df, [retrieved_ids], column_name="metadata"
        )[0]
        if "start_end_idx" in self.corpus_df.columns:
            start_end_indices = fetch_contents(
                self.corpus_df, [retrieved_ids], column_name="start_end_idx"
            )[0]
        else:
            start_end_indices = [None] * len(retrieved_ids)
        return list(
            map(
                lambda content, doc_id, path, metadata, start_end_idx: RetrievedPassage(
                    content=content,
                    doc_id=doc_id,
                    filepath=path,
                    file_page=metadata.get("page", None),
                    start_idx=start_end_idx[0] if start_end_idx else None,
                    end_idx=start_end_idx[1] if start_end_idx else None,
                ),
                contents,
                retrieved_ids,
                paths,
                metadatas,
                start_end_indices,
            )
        )
|