terapyon committed on
Commit
9bc4a6c
1 Parent(s): 6ab28e5

dev/add-metadata-url (#1)

Browse files

- update metadata, url and category (f35d932cb20a039ab283571c1ec0c97663948246)

Files changed (3) hide show
  1. app.py +10 -14
  2. config.py +13 -0
  3. store.py +19 -3
app.py CHANGED
@@ -5,7 +5,7 @@ from langchain.llms import OpenAI
5
  from langchain.vectorstores import Qdrant
6
  from openai.error import InvalidRequestError
7
  from qdrant_client import QdrantClient
8
- from config import get_db_config
9
 
10
 
11
  PERSIST_DIR_NAME = "nvdajp-book"
@@ -13,7 +13,7 @@ PERSIST_DIR_NAME = "nvdajp-book"
13
 
14
  def get_retrieval_qa() -> RetrievalQA:
15
  embeddings = OpenAIEmbeddings()
16
- db_url, db_api_key, db_collection_name = get_db_config()
17
  client = QdrantClient(url=db_url, api_key=db_api_key)
18
  db = Qdrant(client=client, collection_name=db_collection_name, embeddings=embeddings)
19
  retriever = db.as_retriever()
@@ -22,21 +22,17 @@ def get_retrieval_qa() -> RetrievalQA:
22
  )
23
 
24
 
25
- def _remove_prefix_path(p: str):
26
- prefix = "data/rtdocs/nvdajp-book.readthedocs.io/"
27
- return p.removeprefix(prefix)
28
-
29
-
30
  def get_related_url(metadata):
31
- path = set()
32
- url = "https://nvdajp-book.readthedocs.io/"
33
  for m in metadata:
34
- p = m['source']
35
- pathname = _remove_prefix_path(p)
36
- if pathname in path:
37
  continue
38
- path.add(pathname)
39
- yield f'<p>url: <a href="{url}{pathname}">{pathname}</a></p>'
 
 
40
 
41
 
42
  def main(query: str):
 
5
  from langchain.vectorstores import Qdrant
6
  from openai.error import InvalidRequestError
7
  from qdrant_client import QdrantClient
8
+ from config import DB_CONFIG
9
 
10
 
11
  PERSIST_DIR_NAME = "nvdajp-book"
 
13
 
14
  def get_retrieval_qa() -> RetrievalQA:
15
  embeddings = OpenAIEmbeddings()
16
+ db_url, db_api_key, db_collection_name = DB_CONFIG
17
  client = QdrantClient(url=db_url, api_key=db_api_key)
18
  db = Qdrant(client=client, collection_name=db_collection_name, embeddings=embeddings)
19
  retriever = db.as_retriever()
 
22
  )
23
 
24
 
 
 
 
 
 
25
  def get_related_url(metadata):
26
+ urls = set()
 
27
  for m in metadata:
28
+ # p = m['source']
29
+ url = m["url"]
30
+ if url in urls:
31
  continue
32
+ urls.add(url)
33
+ category = m["category"]
34
+ # print(m)
35
+ yield f'<p>URL: <a href="{url}">{url}</a> (category: {category})</p>'
36
 
37
 
38
  def main(query: str):
config.py CHANGED
@@ -1,8 +1,21 @@
1
  import os
2
 
3
 
 
 
 
4
  def get_db_config():
5
  url = os.environ["QDRANT_URL"]
6
  api_key = os.environ["QDRANT_API_KEY"]
7
  collection_name = "nvdajp-book"
8
  return url, api_key, collection_name
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
 
3
 
4
+ SAAS = True
5
+
6
+
7
  def get_db_config():
8
  url = os.environ["QDRANT_URL"]
9
  api_key = os.environ["QDRANT_API_KEY"]
10
  collection_name = "nvdajp-book"
11
  return url, api_key, collection_name
12
+
13
+
14
+ def get_local_db_congin():
15
+ url = "localhost"
16
+ # api_key = os.environ["QDRANT_API_KEY"]
17
+ collection_name = "nvdajp-book"
18
+ return url, None, collection_name
19
+
20
+
21
+ DB_CONFIG = get_db_config() if SAAS else get_local_db_congin()
store.py CHANGED
@@ -3,16 +3,29 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from langchain.embeddings import OpenAIEmbeddings
4
  from langchain.vectorstores import Qdrant
5
  # from qdrant_client import QdrantClient
6
- from config import get_db_config
7
 
8
 
9
  CHUNK_SIZE = 500
10
 
11
 
 
 
 
 
 
12
  def get_documents(path: str):
13
  loader = ReadTheDocsLoader(path, encoding="utf-8")
14
  docs = loader.load()
15
- return docs
 
 
 
 
 
 
 
 
16
 
17
 
18
  def get_text_chunk(docs):
@@ -23,7 +36,7 @@ def get_text_chunk(docs):
23
 
24
  def store(texts):
25
  embeddings = OpenAIEmbeddings()
26
- db_url, db_api_key, db_collection_name = get_db_config()
27
  # client = QdrantClient(url=db_url, api_key=db_api_key, prefer_grpc=True)
28
  _ = Qdrant.from_documents(
29
  texts,
@@ -48,6 +61,9 @@ if __name__ == "__main__":
48
  args = sys.argv
49
  if len(args) != 2:
50
  print("No args, you need two args for html_path")
 
 
 
51
  else:
52
  path = args[1]
53
  # dir_name = args[2]
 
3
  from langchain.embeddings import OpenAIEmbeddings
4
  from langchain.vectorstores import Qdrant
5
  # from qdrant_client import QdrantClient
6
+ from config import DB_CONFIG
7
 
8
 
9
  CHUNK_SIZE = 500
10
 
11
 
12
+ def _remove_prefix_path(p: str):
13
+ prefix = "data/rtdocs/nvdajp-book.readthedocs.io/"
14
+ return p.removeprefix(prefix)
15
+
16
+
17
  def get_documents(path: str):
18
  loader = ReadTheDocsLoader(path, encoding="utf-8")
19
  docs = loader.load()
20
+ base_url = "https://nvdajp-book.readthedocs.io/"
21
+ add_meta = {"category": "ja-book"}
22
+ for doc in docs:
23
+ org_metadata = doc.metadata
24
+ source = _remove_prefix_path(org_metadata["source"])
25
+ add_meta = {"category": "ja-book", "source": source, "url": f"{base_url}{source}"}
26
+ doc.metadata = org_metadata | add_meta
27
+ yield doc
28
+ # return docs
29
 
30
 
31
  def get_text_chunk(docs):
 
36
 
37
  def store(texts):
38
  embeddings = OpenAIEmbeddings()
39
+ db_url, db_api_key, db_collection_name = DB_CONFIG
40
  # client = QdrantClient(url=db_url, api_key=db_api_key, prefer_grpc=True)
41
  _ = Qdrant.from_documents(
42
  texts,
 
61
  args = sys.argv
62
  if len(args) != 2:
63
  print("No args, you need two args for html_path")
64
+ docs = get_documents("data/rtdocs/nvdajp-book.readthedocs.io/ja/latest")
65
+ print(type(docs))
66
+ breakpoint()
67
  else:
68
  path = args[1]
69
  # dir_name = args[2]