Spaces:

docpro
/

AWEsumCare-Demo

Running

App Files Community

ray committed on Dec 7, 2023

Commit

9021b39

•

1 Parent(s): 61ec090

v2 - manually split knowledge units

Browse files

Files changed (5) hide show

.gitignore +4 -0
app.py +13 -6
chatbot.py +8 -8
custom_io.py +45 -0
scripts/convert_docx_to_md.sh +37 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,7 @@
 .env
 **/__pycache__
 awesumcare_data

 .env
 **/__pycache__
 awesumcare_data
+TestData
+logs
+wandb
+streamlit_chatbot_pack

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import openai
 import os
@@ -14,7 +15,7 @@ from llama_index.ingestion import IngestionPipeline
 from chat_template import CHAT_TEXT_QA_PROMPT
 from schemas import ChatbotVersion, ServiceProvider
 from chatbot import Chatbot, IndexBuilder
-from custom_io import UnstructuredReader, default_file_metadata_func
 from qdrant import client as qdrantClient
 from llama_index import set_global_service_context
@@ -28,11 +29,11 @@ llama_index.set_global_handler("arize_phoenix")
 openai.api_key = os.getenv("OPENAI_API_KEY")
 IS_LOAD_FROM_VECTOR_STORE = True
-VDB_COLLECTION_NAME = "demo-v0"
 MODEL_NAME = ChatbotVersion.CHATGPT_4.value
-CHUNK_SIZE = 1024
 LLM, EMBED_MODEL = get_service_provider_config(
     service_provider=ServiceProvider.OPENAI, model_name=MODEL_NAME)
 service_context = ServiceContext.from_defaults(
@@ -45,13 +46,19 @@ set_global_service_context(service_context)
 class AwesumIndexBuilder(IndexBuilder):
     def _load_doucments(self):
-        dir_reader = SimpleDirectoryReader('./awesumcare_data', file_extractor={
             ".pdf": UnstructuredReader(),
             ".docx": UnstructuredReader(),
             ".pptx": UnstructuredReader(),
         },
             recursive=True,
-            exclude=["*.png", "*.pptx"],
             file_metadata=default_file_metadata_func)
         self.documents = dir_reader.load_data()
@@ -73,7 +80,7 @@ class AwesumIndexBuilder(IndexBuilder):
             return
         pipeline = IngestionPipeline(
             transformations=[
-                SentenceSplitter(),
                 self.embed_model,
             ],
             vector_store=self.vector_store,

+import glob
 import gradio as gr
 import openai
 import os
 from chat_template import CHAT_TEXT_QA_PROMPT
 from schemas import ChatbotVersion, ServiceProvider
 from chatbot import Chatbot, IndexBuilder
+from custom_io import MarkdownReader, UnstructuredReader, default_file_metadata_func
 from qdrant import client as qdrantClient
 from llama_index import set_global_service_context
 openai.api_key = os.getenv("OPENAI_API_KEY")
 IS_LOAD_FROM_VECTOR_STORE = True
+VDB_COLLECTION_NAME = "demo-v1"
 MODEL_NAME = ChatbotVersion.CHATGPT_4.value
+CHUNK_SIZE = 8191
 LLM, EMBED_MODEL = get_service_provider_config(
     service_provider=ServiceProvider.OPENAI, model_name=MODEL_NAME)
 service_context = ServiceContext.from_defaults(
 class AwesumIndexBuilder(IndexBuilder):
     def _load_doucments(self):
+        directory = "./awesumcare_data/awesumcare_manual_data"
+        # all_files = glob.glob(os.path.join(directory, '*.md'))
+        # faq_files = [f for f in all_files if 'FAQ' in os.path.basename(f)]
+        # print(faq_files)
+        dir_reader = SimpleDirectoryReader(directory, file_extractor={
             ".pdf": UnstructuredReader(),
             ".docx": UnstructuredReader(),
             ".pptx": UnstructuredReader(),
+            ".md": MarkdownReader()
         },
             recursive=True,
+            # input_files=faq_files,
+            exclude=["*.png", "*.pptx", "*.docx", "*.pdf"],
             file_metadata=default_file_metadata_func)
         self.documents = dir_reader.load_data()
             return
         pipeline = IngestionPipeline(
             transformations=[
+                # SentenceSplitter(),
                 self.embed_model,
             ],
             vector_store=self.vector_store,

chatbot.py CHANGED Viewed

@@ -126,14 +126,14 @@ class Chatbot:
             partial_message += token
             yield partial_message
-        urls = [source.node.metadata.get(
-            "file_name") for source in response.source_nodes if source.score >= 0.78 and source.node.metadata.get("file_name")]
-        if urls:
-            urls = list(set(urls))
-            url_section = "\n&nbsp;\n\n---\n\n參考: \n" + \
-                "\n".join(f"- {url}" for url in urls)
-            partial_message += url_section
-            yield partial_message
     def convert_to_chat_messages(self, history: List[List[str]]) -> List[ChatMessage]:
         chat_messages = [ChatMessage(

             partial_message += token
             yield partial_message
+        # urls = [source.node.metadata.get(
+        #     "file_name") for source in response.source_nodes if source.score >= 0.78 and source.node.metadata.get("file_name")]
+        # if urls:
+        #     urls = list(set(urls))
+        #     url_section = "\n&nbsp;\n\n---\n\n參考: \n" + \
+        #         "\n".join(f"- {url}" for url in urls)
+        #     partial_message += url_section
+        #     yield partial_message
     def convert_to_chat_messages(self, history: List[List[str]]) -> List[ChatMessage]:
         chat_messages = [ChatMessage(

custom_io.py CHANGED Viewed

@@ -50,6 +50,51 @@ class UnstructuredReader(BaseReader):
             ]
 def default_file_metadata_func(file_path: str) -> Dict:
     """Get some handy metadate from filesystem.

             ]
+class MarkdownReader(BaseReader):
+    """General unstructured text reader for a variety of files."""
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+    def load_data(
+        self,
+        file: Path,
+        extra_info: Optional[Dict] = None,
+        split_documents: Optional[bool] = True,
+    ) -> List[Document]:
+        """Parse file."""
+        from unstructured.partition.auto import partition
+        elements = parse_knowledge_units(str(file))
+        if split_documents:
+            return [
+                Document(text=ele, extra_info=extra_info or {})
+                for ele in elements
+            ]
+def parse_knowledge_units(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+    knowledge_units = []
+    current_unit = ""
+    for line in lines:
+        if line.strip() and line[0].isdigit() and '.' in line:
+            if current_unit:
+                knowledge_units.append(current_unit.strip())
+                current_unit = ""
+            current_unit += line
+        else:
+            current_unit += line
+    if current_unit:
+        knowledge_units.append(current_unit.strip())
+    return knowledge_units
 def default_file_metadata_func(file_path: str) -> Dict:
     """Get some handy metadate from filesystem.

scripts/convert_docx_to_md.sh ADDED Viewed

@@ -0,0 +1,37 @@

+#!/bin/bash
+# Check if a directory path is provided
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <directory_path>"
+    exit 1
+fi
+# Get the directory path from the argument
+dir_path=$1
+# Check if the specified directory exists
+if [ ! -d "$dir_path" ]; then
+    echo "Directory does not exist: $dir_path"
+    exit 1
+fi
+# Iterate through all .docx files in the specified directory
+for docx_file in "$dir_path"/*.docx; do
+    # Skip if no .docx files are found
+    if [ ! -f "$docx_file" ]; then
+        continue
+    fi
+    # Extract filename without extension
+    filename=$(basename -- "$docx_file")
+    filename="${filename%.*}"
+    # Define the output Markdown filename
+    md_file="${dir_path}/${filename}.md"
+    # Convert the document to Markdown format
+    pandoc -t markdown --extract-media="$dir_path" "$docx_file" -o "$md_file"
+    echo "Converted: $docx_file to $md_file"
+done
+echo "Conversion complete."