ray committed
Commit 9021b39
Parent(s): 61ec090

v2 - manually split knowledge units

Files changed (5):
  1. .gitignore +4 -0
  2. app.py +13 -6
  3. chatbot.py +8 -8
  4. custom_io.py +45 -0
  5. scripts/convert_docx_to_md.sh +37 -0
.gitignore CHANGED
@@ -1,3 +1,7 @@
 .env
 **/__pycache__
 awesumcare_data
+TestData
+logs
+wandb
+streamlit_chatbot_pack
app.py CHANGED
@@ -1,3 +1,4 @@
+import glob
 import gradio as gr
 import openai
 import os
@@ -14,7 +15,7 @@ from llama_index.ingestion import IngestionPipeline
 from chat_template import CHAT_TEXT_QA_PROMPT
 from schemas import ChatbotVersion, ServiceProvider
 from chatbot import Chatbot, IndexBuilder
-from custom_io import UnstructuredReader, default_file_metadata_func
+from custom_io import MarkdownReader, UnstructuredReader, default_file_metadata_func
 from qdrant import client as qdrantClient
 from llama_index import set_global_service_context
 
@@ -28,11 +29,11 @@ llama_index.set_global_handler("arize_phoenix")
 openai.api_key = os.getenv("OPENAI_API_KEY")
 
 IS_LOAD_FROM_VECTOR_STORE = True
-VDB_COLLECTION_NAME = "demo-v0"
+VDB_COLLECTION_NAME = "demo-v1"
 MODEL_NAME = ChatbotVersion.CHATGPT_4.value
 
 
-CHUNK_SIZE = 1024
+CHUNK_SIZE = 8191
 LLM, EMBED_MODEL = get_service_provider_config(
     service_provider=ServiceProvider.OPENAI, model_name=MODEL_NAME)
 service_context = ServiceContext.from_defaults(
@@ -45,13 +46,19 @@ set_global_service_context(service_context)
 
 class AwesumIndexBuilder(IndexBuilder):
     def _load_doucments(self):
-        dir_reader = SimpleDirectoryReader('./awesumcare_data', file_extractor={
+        directory = "./awesumcare_data/awesumcare_manual_data"
+        # all_files = glob.glob(os.path.join(directory, '*.md'))
+        # faq_files = [f for f in all_files if 'FAQ' in os.path.basename(f)]
+        # print(faq_files)
+        dir_reader = SimpleDirectoryReader(directory, file_extractor={
             ".pdf": UnstructuredReader(),
             ".docx": UnstructuredReader(),
             ".pptx": UnstructuredReader(),
+            ".md": MarkdownReader()
         },
             recursive=True,
-            exclude=["*.png", "*.pptx"],
+            # input_files=faq_files,
+            exclude=["*.png", "*.pptx", "*.docx", "*.pdf"],
             file_metadata=default_file_metadata_func)
 
         self.documents = dir_reader.load_data()
@@ -73,7 +80,7 @@ class AwesumIndexBuilder(IndexBuilder):
             return
         pipeline = IngestionPipeline(
             transformations=[
-                SentenceSplitter(),
+                # SentenceSplitter(),
                 self.embed_model,
             ],
             vector_store=self.vector_store,
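
With SentenceSplitter() commented out in the last hunk above, each manually split knowledge unit is embedded as a single node, so every unit must fit the embedding model's input window; CHUNK_SIZE = 8191 presumably matches the 8,191-token input limit of OpenAI's text-embedding-ada-002. A minimal sanity check along those lines, assuming the parse_knowledge_units helper added to custom_io.py below and the cl100k_base tokenizer used by that model:

import glob
import tiktoken

from custom_io import parse_knowledge_units

enc = tiktoken.get_encoding("cl100k_base")  # tokenizer behind text-embedding-ada-002

for path in glob.glob("./awesumcare_data/awesumcare_manual_data/**/*.md", recursive=True):
    for unit in parse_knowledge_units(path):
        n_tokens = len(enc.encode(unit))
        if n_tokens > 8191:
            print(f"{path}: a {n_tokens}-token unit exceeds the embedding limit")
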
chatbot.py CHANGED
@@ -126,14 +126,14 @@ class Chatbot:
             partial_message += token
             yield partial_message
 
-        urls = [source.node.metadata.get(
-            "file_name") for source in response.source_nodes if source.score >= 0.78 and source.node.metadata.get("file_name")]
-        if urls:
-            urls = list(set(urls))
-            url_section = "\n \n\n---\n\n參考: \n" + \
-                "\n".join(f"- {url}" for url in urls)
-            partial_message += url_section
-            yield partial_message
+        # urls = [source.node.metadata.get(
+        #     "file_name") for source in response.source_nodes if source.score >= 0.78 and source.node.metadata.get("file_name")]
+        # if urls:
+        #     urls = list(set(urls))
+        #     url_section = "\n \n\n---\n\n參考: \n" + \
+        #         "\n".join(f"- {url}" for url in urls)
+        #     partial_message += url_section
+        #     yield partial_message
 
     def convert_to_chat_messages(self, history: List[List[str]]) -> List[ChatMessage]:
         chat_messages = [ChatMessage(
custom_io.py CHANGED
@@ -50,6 +50,53 @@ class UnstructuredReader(BaseReader):
     ]
 
 
+class MarkdownReader(BaseReader):
+    """Markdown reader that splits a file into manually numbered knowledge units."""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+
+    def load_data(
+        self,
+        file: Path,
+        extra_info: Optional[Dict] = None,
+        split_documents: Optional[bool] = True,
+    ) -> List[Document]:
+        """Parse the file into one Document per knowledge unit."""
+        elements = parse_knowledge_units(str(file))
+
+        if split_documents:
+            return [
+                Document(text=ele, extra_info=extra_info or {})
+                for ele in elements
+            ]
+        return [Document(text="\n".join(elements), extra_info=extra_info or {})]
+
+
+def parse_knowledge_units(file_path):
+    """Group lines into units, each starting at a numbered line such as '1. ...'."""
+    with open(file_path, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+
+    knowledge_units = []
+    current_unit = ""
+
+    for line in lines:
+        # A non-empty line starting with a digit and containing '.' opens a new unit.
+        if line.strip() and line[0].isdigit() and '.' in line:
+            if current_unit:
+                knowledge_units.append(current_unit.strip())
+                current_unit = ""
+            current_unit += line
+        else:
+            current_unit += line
+
+    if current_unit:
+        knowledge_units.append(current_unit.strip())
+
+    return knowledge_units
+
 def default_file_metadata_func(file_path: str) -> Dict:
     """Get some handy metadate from filesystem.
 
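A quick way to sanity-check the splitter (the FAQ snippet is made up, but it follows the numbered-line format parse_knowledge_units expects):

from custom_io import parse_knowledge_units

# Hypothetical FAQ content in the "N. question" format.
sample = """1. What is Awesum Care?
A care-planning service.

2. How do I sign up?
Fill in the registration form.
"""

with open("/tmp/faq_sample.md", "w", encoding="utf-8") as f:
    f.write(sample)

units = parse_knowledge_units("/tmp/faq_sample.md")
print(len(units))  # 2 -- one knowledge unit per numbered question
print(units[0])    # "1. What is Awesum Care?\nA care-planning service."
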
scripts/convert_docx_to_md.sh ADDED
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Check if a directory path is provided
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <directory_path>"
+    exit 1
+fi
+
+# Get the directory path from the argument
+dir_path=$1
+
+# Check if the specified directory exists
+if [ ! -d "$dir_path" ]; then
+    echo "Directory does not exist: $dir_path"
+    exit 1
+fi
+
+# Iterate through all .docx files in the specified directory
+for docx_file in "$dir_path"/*.docx; do
+    # Skip if no .docx files are found
+    if [ ! -f "$docx_file" ]; then
+        continue
+    fi
+
+    # Extract filename without extension
+    filename=$(basename -- "$docx_file")
+    filename="${filename%.*}"
+
+    # Define the output Markdown filename
+    md_file="${dir_path}/${filename}.md"
+
+    # Convert the document to Markdown format
+    pandoc -t markdown --extract-media="$dir_path" "$docx_file" -o "$md_file"
+    echo "Converted: $docx_file to $md_file"
+done
+
+echo "Conversion complete."
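
To regenerate the Markdown sources before re-indexing, the script would be run as, for example, bash scripts/convert_docx_to_md.sh ./awesumcare_data/awesumcare_manual_data (the directory is assumed from the reader change in app.py above). pandoc must be on PATH, and --extract-media writes any embedded images next to the output files, which may be why "*.png" stays in the reader's exclude list.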