robkaandorp committed
Commit 3b327ab
1 Parent(s): c485680

Add create_dataset.py


Add query_dataset.py
Add chroma database with chunked website content

.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 **/*.sqlite3 filter=lfs diff=lfs merge=lfs -text
+*.sqlite3 filter=lfs diff=lfs merge=lfs -text
chroma_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a70637d8d8f17398861cbb834eaca0a14aab6564f7cf8ecb495365dcfb35fcda
+size 48570368
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d89713b56782670ff98432b154b924c55decce1b78b10ada067fae913a881b3f
+size 11732000
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/header.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d682d098e967a97e880475a3ac506f4d738b6933a24eea59cdb3a0a86c9eb179
+size 100
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0f886e27dfdf9d61c86f16d9f08cf080c39928e76d39de829af92e5fd1e21a6
+size 404064
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/length.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:810bc84d607e5dd8132a9a069b5e53e3385fb324817dd66dbc5e8cf22acf1466
+size 28000
chroma_db/efd5d06e-e95a-4a28-9283-da32938f0c75/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04ecb32a3d55d75009bba9964f174c58a35c33095254fc8b971489736d9b593d
+size 60232
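
Each file above is stored as a Git LFS pointer: a three-line stub recording the spec version, the SHA-256 object id, and the size in bytes, while the actual binary lives in LFS storage. As an illustration only (not part of this commit), such a pointer can be parsed with a few lines of Python:

# read a Git LFS pointer file into a dict of its "key value" fields
def read_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# any of the pointer files in this commit would work here
print(read_lfs_pointer("chroma_db/chroma.sqlite3"))
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:a70637d8...', 'size': '48570368'}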
create_dataset.py ADDED
@@ -0,0 +1,34 @@
+from langchain_community.document_loaders import DirectoryLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from datasets import load_dataset
+from langchain_community.document_loaders import TextLoader
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+from langchain_community.vectorstores import Chroma
+from unstructured.cleaners.core import clean_extra_whitespace
+
+html_loader = DirectoryLoader('./www.goingnowhere.org', glob="**/*.html", show_progress=True)
+pdf_loader = DirectoryLoader('./www.goingnowhere.org', glob="**/*.pdf", show_progress=True)
+
+html_docs = html_loader.load()
+pdf_docs = pdf_loader.load()
+
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=500,
+    chunk_overlap=30,
+    length_function=len,
+    is_separator_regex=False,
+)
+
+texts = []
+texts.extend(text_splitter.split_documents(html_docs))
+texts.extend(text_splitter.split_documents(pdf_docs))
+
+# create the open-source embedding function
+embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+# load it into Chroma
+db = Chroma.from_documents(texts, embedding_function, persist_directory="./chroma_db")
+
+print("There are", db._collection.count(), "docs in the collection")
query_dataset.py ADDED
@@ -0,0 +1,28 @@
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+from langchain_community.vectorstores import Chroma
+
+# create the open-source embedding function
+embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+# load the persisted Chroma database
+db = Chroma(embedding_function=embedding_function, persist_directory="./chroma_db")
+
+print("There are", db._collection.count(), "docs in the collection")
+
+queries = [
+    "Where is the Nowhere event?",
+    "Give me some information about the toilets.",
+    "What is consent?",
+]
+
+for query in queries:
+    # query it
+    docs = db.similarity_search(query)
+
+    # print results
+    print(f"\n\nQuery: {query}")
+    print(f"Results: {len(docs)}")
+    print(f"First result: {docs[0].page_content}")
+    print(f"Second result: {docs[1].page_content}")