Mustehson committed on
Commit
2833068
·
1 Parent(s): e368b39

Created Duckdb Vector Store

Browse files
Files changed (3) hide show
  1. __pycache__/app.cpython-311.pyc +0 -0
  2. app.py +32 -3
  3. requirements.txt +3 -1
__pycache__/app.cpython-311.pyc DELETED
Binary file (9.85 kB)
 
app.py CHANGED
@@ -1,7 +1,10 @@
1
  import re
 
 
2
  import gradio as gr
3
  from io import StringIO
4
- import pandas as pd
 
5
  from langchain_community.document_loaders import RecursiveUrlLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_community.document_transformers import Html2TextTransformer
@@ -9,6 +12,24 @@ from langchain_community.document_transformers import Html2TextTransformer
9
 
10
  TAB_LINES = 22
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def html_only_metadata_extractor(raw_html, url, response):
13
  content_type = response.headers.get("Content-Type", "")
14
  if "text/html" in content_type:
@@ -90,6 +111,10 @@ def concat_dfs(df_list):
90
  concatenated_df = pd.concat(df_list, ignore_index=True)
91
  return concatenated_df
92
 
 
 
 
 
93
 
94
  def get_docs(url, max_depth):
95
  raw_html = scrape_text(url, max_depth)
@@ -108,8 +133,10 @@ def get_docs(url, max_depth):
108
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
109
  documents_splits = text_splitter.split_documents(clean_docs)
110
  formatted_chunks = format_chunks_with_spaces(documents_splits)
 
 
111
 
112
- return format_page_content(raw_html), format_page_content(clean_docs), concat_tables, format_metdata(raw_html), formatted_chunks
113
 
114
 
115
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
@@ -147,9 +174,11 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
147
  with gr.Tab("Metadata"):
148
  metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="", interactive=False,
149
  autoscroll=False)
 
 
150
 
151
  scarpe_url_button.click(get_docs, inputs=[url_input, max_depth], outputs=[raw_page_content, page_content, tables,
152
- metadata, parsed_chunks])
153
 
154
 
155
  if __name__ == "__main__":
 
1
  import re
2
+ import duckdb
3
+ import pandas as pd
4
  import gradio as gr
5
  from io import StringIO
6
+ from langchain_community.vectorstores.duckdb import DuckDB
7
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
8
  from langchain_community.document_loaders import RecursiveUrlLoader
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
  from langchain_community.document_transformers import Html2TextTransformer
 
12
 
13
  TAB_LINES = 22
14
 
15
# Embedding model settings: checkpoint name plus the two keyword dictionaries
# that are handed to the embedding wrapper below.
model_name = "BAAI/bge-small-en-v1.5"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

# Embedding wrapper built from the settings above.
hf = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

# Connection to the DuckDB database file "Collections.duckdb".
con = duckdb.connect("Collections.duckdb")

# Vector store that uses the connection and the embedding wrapper created above.
vector_store = DuckDB(embedding=hf, connection=con)
32
+
33
  def html_only_metadata_extractor(raw_html, url, response):
34
  content_type = response.headers.get("Content-Type", "")
35
  if "text/html" in content_type:
 
111
  concatenated_df = pd.concat(df_list, ignore_index=True)
112
  return concatenated_df
113
 
114
def create_embeddings(docs):
    """Add ``docs`` to the vector store and return the rows that were written.

    Args:
        docs: Documents passed unchanged to ``vector_store.add_documents``.

    Returns:
        A pandas DataFrame with the rows of the ``embeddings`` table whose
        ``id`` is one of the ids returned by ``add_documents``. The frame has
        the table's columns and no rows when no ids were returned.
    """
    ids = list(vector_store.add_documents(docs))
    if not ids:
        # An empty "IN ()" list is not valid SQL; return the table's columns
        # with zero rows instead.
        return con.execute("SELECT * FROM embeddings LIMIT 0").fetchdf()

    # Filter inside DuckDB rather than loading the whole table into pandas:
    # the table gains rows on every call, so a full fetch keeps getting slower.
    # The ids are bound as parameters, never interpolated into the SQL text.
    placeholders = ", ".join("?" for _ in ids)
    query = f"SELECT * FROM embeddings WHERE id IN ({placeholders})"
    return con.execute(query, ids).fetchdf()
118
 
119
  def get_docs(url, max_depth):
120
  raw_html = scrape_text(url, max_depth)
 
133
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
134
  documents_splits = text_splitter.split_documents(clean_docs)
135
  formatted_chunks = format_chunks_with_spaces(documents_splits)
136
+ embeddings = create_embeddings(documents_splits)
137
+
138
 
139
+ return format_page_content(raw_html), format_page_content(clean_docs), concat_tables, format_metdata(raw_html), formatted_chunks, embeddings
140
 
141
 
142
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
 
174
  with gr.Tab("Metadata"):
175
  metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="", interactive=False,
176
  autoscroll=False)
177
+ with gr.Tab("Embeddings"):
178
+ embeddings = gr.Dataframe(label="Vector Store", interactive=False)
179
 
180
  scarpe_url_button.click(get_docs, inputs=[url_input, max_depth], outputs=[raw_page_content, page_content, tables,
181
+ metadata, parsed_chunks, embeddings])
182
 
183
 
184
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -5,4 +5,6 @@ langchain-text-splitters
5
  html2text
6
  lxml
7
  beautifulsoup4
8
- html5lib
 
 
 
5
  html2text
6
  lxml
7
  beautifulsoup4
8
+ html5lib
9
+ duckdb
10
+ sentence_transformers