robkaandorp
committed on
Commit
•
3b327ab
1
Parent(s):
c485680
Add create_dataset.py
Browse files
Add query_dataset.py
Add chroma database with chunked website content
- .gitattributes +1 -0
- chroma_db/chroma.sqlite3 +3 -0
- chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/data_level0.bin +3 -0
- chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/header.bin +3 -0
- chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/index_metadata.pickle +3 -0
- chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/length.bin +3 -0
- chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/link_lists.bin +3 -0
- create_dataset.py +34 -0
- query_dataset.py +28 -0
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
**/*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
**/*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
chroma_db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a70637d8d8f17398861cbb834eaca0a14aab6564f7cf8ecb495365dcfb35fcda
|
3 |
+
size 48570368
|
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d89713b56782670ff98432b154b924c55decce1b78b10ada067fae913a881b3f
|
3 |
+
size 11732000
|
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d682d098e967a97e880475a3ac506f4d738b6933a24eea59cdb3a0a86c9eb179
|
3 |
+
size 100
|
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c0f886e27dfdf9d61c86f16d9f08cf080c39928e76d39de829af92e5fd1e21a6
|
3 |
+
size 404064
|
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:810bc84d607e5dd8132a9a069b5e53e3385fb324817dd66dbc5e8cf22acf1466
|
3 |
+
size 28000
|
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04ecb32a3d55d75009bba9964f174c58a35c33095254fc8b971489736d9b593d
|
3 |
+
size 60232
|
create_dataset.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import DirectoryLoader
|
2 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
3 |
+
from datasets import load_dataset
|
4 |
+
from langchain_community.document_loaders import TextLoader
|
5 |
+
from langchain_community.embeddings.sentence_transformer import (
|
6 |
+
SentenceTransformerEmbeddings,
|
7 |
+
)
|
8 |
+
from langchain_community.vectorstores import Chroma
|
9 |
+
from unstructured.cleaners.core import clean_extra_whitespace
|
10 |
+
|
11 |
+
html_loader = DirectoryLoader('./www.goingnowhere.org', glob="**/*.html", show_progress=True)
|
12 |
+
pdf_loader = DirectoryLoader('./www.goingnowhere.org', glob="**/*.pdf", show_progress=True)
|
13 |
+
|
14 |
+
html_docs = html_loader.load()
|
15 |
+
pdf_docs = pdf_loader.load()
|
16 |
+
|
17 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
18 |
+
chunk_size=500,
|
19 |
+
chunk_overlap=30,
|
20 |
+
length_function=len,
|
21 |
+
is_separator_regex=False,
|
22 |
+
)
|
23 |
+
|
24 |
+
texts = []
|
25 |
+
texts.extend(text_splitter.split_documents(html_docs))
|
26 |
+
texts.extend(text_splitter.split_documents(pdf_docs))
|
27 |
+
|
28 |
+
# create the open-source embedding function
|
29 |
+
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
30 |
+
|
31 |
+
# load it into Chroma
|
32 |
+
db = Chroma.from_documents(texts, embedding_function, persist_directory="./chroma_db")
|
33 |
+
|
34 |
+
print("There are", db._collection.count(), "in the collection")
|
query_dataset.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.embeddings.sentence_transformer import (
|
2 |
+
SentenceTransformerEmbeddings,
|
3 |
+
)
|
4 |
+
from langchain_community.vectorstores import Chroma
|
5 |
+
|
6 |
+
# create the open-source embedding function
|
7 |
+
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
8 |
+
|
9 |
+
# load it into Chroma
|
10 |
+
db = Chroma(embedding_function=embedding_function, persist_directory="./chroma_db")
|
11 |
+
|
12 |
+
print("There are", db._collection.count(), " docs in the collection")
|
13 |
+
|
14 |
+
queries = [
|
15 |
+
"Where is the Nowhere event?",
|
16 |
+
"Give me some information about the toilets.",
|
17 |
+
"What is consent?",
|
18 |
+
]
|
19 |
+
|
20 |
+
for query in queries:
|
21 |
+
# query it
|
22 |
+
docs = db.similarity_search(query)
|
23 |
+
|
24 |
+
# print results
|
25 |
+
print(f"\n\nQuery: {query}")
|
26 |
+
print(f"Results: {len(docs)}")
|
27 |
+
print(f"First result: {docs[0].page_content}")
|
28 |
+
print(f"Second result: {docs[1].page_content}")
|