robkaandorp committed
Commit 3b327ab
1 Parent(s): c485680

Add create_dataset.py


Add query_dataset.py
Add chroma database with chunked website content

.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 **/*.sqlite3 filter=lfs diff=lfs merge=lfs -text
+*.sqlite3 filter=lfs diff=lfs merge=lfs -text
chroma_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a70637d8d8f17398861cbb834eaca0a14aab6564f7cf8ecb495365dcfb35fcda
+size 48570368
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d89713b56782670ff98432b154b924c55decce1b78b10ada067fae913a881b3f
+size 11732000
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/header.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d682d098e967a97e880475a3ac506f4d738b6933a24eea59cdb3a0a86c9eb179
+size 100
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0f886e27dfdf9d61c86f16d9f08cf080c39928e76d39de829af92e5fd1e21a6
+size 404064
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/length.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:810bc84d607e5dd8132a9a069b5e53e3385fb324817dd66dbc5e8cf22acf1466
+size 28000
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04ecb32a3d55d75009bba9964f174c58a35c33095254fc8b971489736d9b593d
+size 60232
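
Each file above is stored as a Git LFS pointer: a three-line stub recording the spec version, the SHA-256 object id, and the size in bytes, while the actual binary lives in LFS storage. As an illustration only (not part of this commit), such a pointer can be parsed with a few lines of Python:

# read a Git LFS pointer file into a dict of its "key value" fields
def read_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# any of the pointer files in this commit would work here
print(read_lfs_pointer("chroma_db/chroma.sqlite3"))
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:a70637d8...', 'size': '48570368'}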
create_dataset.py ADDED
@@ -0,0 +1,34 @@
+from langchain_community.document_loaders import DirectoryLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from datasets import load_dataset
+from langchain_community.document_loaders import TextLoader
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+from langchain_community.vectorstores import Chroma
+from unstructured.cleaners.core import clean_extra_whitespace
+
+html_loader = DirectoryLoader('./www.goingnowhere.org', glob="**/*.html", show_progress=True)
+pdf_loader = DirectoryLoader('./www.goingnowhere.org', glob="**/*.pdf", show_progress=True)
+
+html_docs = html_loader.load()
+pdf_docs = pdf_loader.load()
+
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=500,
+    chunk_overlap=30,
+    length_function=len,
+    is_separator_regex=False,
+)
+
+texts = []
+texts.extend(text_splitter.split_documents(html_docs))
+texts.extend(text_splitter.split_documents(pdf_docs))
+
+# create the open-source embedding function
+embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+# load it into Chroma
+db = Chroma.from_documents(texts, embedding_function, persist_directory="./chroma_db")
+
+print("There are", db._collection.count(), "docs in the collection")
query_dataset.py ADDED
@@ -0,0 +1,28 @@
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+from langchain_community.vectorstores import Chroma
+
+# create the open-source embedding function
+embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+# load the persisted Chroma database
+db = Chroma(embedding_function=embedding_function, persist_directory="./chroma_db")
+
+print("There are", db._collection.count(), "docs in the collection")
+
+queries = [
+    "Where is the Nowhere event?",
+    "Give me some information about the toilets.",
+    "What is consent?",
+]
+
+for query in queries:
+    # query it
+    docs = db.similarity_search(query)
+
+    # print results
+    print(f"\n\nQuery: {query}")
+    print(f"Results: {len(docs)}")
+    print(f"First result: {docs[0].page_content}")
+    print(f"Second result: {docs[1].page_content}")