Update code

- ingest.py +10 -6
- ingest_examples.py +7 -2
ingest.py
CHANGED
@@ -6,7 +6,7 @@ from markdown import markdown
 import pickle
 from bs4 import BeautifulSoup
 from langchain.text_splitter import CharacterTextSplitter
-from langchain.embeddings import
+from langchain.embeddings import HuggingFaceInstructEmbeddings, OpenAIEmbeddings
 from langchain.vectorstores import FAISS
 from InstructorEmbedding import INSTRUCTOR
 
@@ -25,22 +25,26 @@ for p in Path("docs").rglob("*"):
         continue
     if str(p).lower().endswith(('.md', '.mdx')):
         with open(p) as f:
-            print(p)
             filename = os.path.splitext(p)[0]
             docs.append(clean_data(f.read()))
-
+            newfile_name = filename.replace("\\", "/")[5:]
+            print("file:" + newfile_name)
+            metadatas.append({"source": newfile_name})
 
 text_splitter = CharacterTextSplitter(
     separator="\n",
-    chunk_size=
-    chunk_overlap=
+    chunk_size=768,
+    chunk_overlap=128,
     length_function=len,
 )
 
 documents = text_splitter.create_documents(docs, metadatas=metadatas)
 
 print("making embedding")
-
+model_name = "hkunlp/instructor-large"
+embed_instruction = "Represent the text from the Hugging Face code documentation"
+query_instruction = "Query the most relevant text from the Hugging Face code documentation"
+embedding = HuggingFaceInstructEmbeddings(model_name=model_name, embed_instruction=embed_instruction, query_instruction=query_instruction)
 
 print("beginning construction of faiss")
 search_index = FAISS.from_documents(documents, embedding)
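Taken together, the ingest.py changes move the index to instructor-large embeddings, attach a "source" entry to every chunk's metadata, and pin the splitter to 768-character chunks with 128 characters of overlap. Below is a minimal self-contained sketch of that flow, assuming the langchain APIs used in the diff; the sample texts, metadata values, and query string are illustrative stand-ins for the real docs/ tree, not part of the commit.

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Same embedding configuration as the updated ingest.py.
embedding = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    embed_instruction="Represent the text from the Hugging Face code documentation",
    query_instruction="Query the most relevant text from the Hugging Face code documentation",
)

# Illustrative in-memory documents standing in for the cleaned markdown files.
docs = [
    "Spaces are a simple way to host ML demo apps on your profile.",
    "The Hub hosts models, datasets and Spaces in git-based repositories.",
]
metadatas = [{"source": "hub/spaces"}, {"source": "hub/index"}]

# Same splitter settings introduced by the commit.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=768,
    chunk_overlap=128,
    length_function=len,
)
documents = text_splitter.create_documents(docs, metadatas=metadatas)

search_index = FAISS.from_documents(documents, embedding)

# Every hit carries the {"source": ...} metadata attached during ingestion.
for doc in search_index.similarity_search("How do I host a demo on Spaces?", k=2):
    print(doc.metadata["source"], "->", doc.page_content[:60])

Carrying the source path in the chunk metadata presumably lets downstream answers point back to the documentation page a chunk came from.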
ingest_examples.py
CHANGED
@@ -3,7 +3,7 @@ import os
 from pathlib import Path
 import pickle
 from langchain.vectorstores import FAISS
-from langchain.embeddings import
+from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.prompts.example_selector import \
     SemanticSimilarityExampleSelector
@@ -41,11 +41,16 @@ rephrase_documents = [
     }
 ]
 
+model_name = "hkunlp/instructor-large"
+embed_instruction = "Represent the text from the Hugging Face code documentation"
+query_instruction = "Query the most relevant text from the Hugging Face code documentation"
+embedding = HuggingFaceInstructEmbeddings(model_name=model_name, embed_instruction=embed_instruction, query_instruction=query_instruction)
+
 example_selector = SemanticSimilarityExampleSelector.from_examples(
     # This is the list of examples available to select from.
     rephrase_documents,
     # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
-
+    embedding,
     # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
     FAISS,
     # This is the number of examples to produce.
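In ingest_examples.py the same instructor-large embeddings now back the semantic example selector. A small sketch of how such a selector behaves is below, assuming hypothetical example dicts with "question"/"answer" keys; the real rephrase_documents entries and their keys are defined earlier in the file and are not shown in this diff.

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.vectorstores import FAISS

# Same embedding configuration as in the updated file.
embedding = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    embed_instruction="Represent the text from the Hugging Face code documentation",
    query_instruction="Query the most relevant text from the Hugging Face code documentation",
)

# Hypothetical stand-ins for the rephrase_documents entries.
examples = [
    {"question": "how do i make a space private", "answer": "How do I make a Space private?"},
    {"question": "gradio app not loading", "answer": "Why is my Gradio Space not loading?"},
]

example_selector = SemanticSimilarityExampleSelector.from_examples(
    examples,    # examples available to select from
    embedding,   # embeddings used to measure semantic similarity
    FAISS,       # vector store class that indexes the embedded examples
    k=1,         # number of examples to produce
)

# Returns the stored example(s) closest in embedding space to the input.
print(example_selector.select_examples({"question": "can I hide my space from others?"}))

from_examples embeds each example dict's values up front and stores them in a FAISS index, so select_examples only has to embed the incoming input and return the k nearest stored examples.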