Omar Solano committed on
Commit
680fe32
1 Parent(s): 139a897

add langchain documentation

Browse files
data/scraping_scripts/create_vector_stores.py CHANGED
@@ -1,3 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import argparse
2
  import json
3
  import os
@@ -27,6 +53,10 @@ SOURCE_CONFIGS = {
27
  "input_file": "data/openai_cookbooks_data.jsonl",
28
  "db_name": "chroma-db-openai_cookbooks",
29
  },
 
 
 
 
30
  }
31
 
32
 
 
1
+ """
2
+ Vector Store Creation Script
3
+
4
+ Purpose:
5
+ This script processes various data sources (e.g., transformers, peft, trl, llama_index, openai_cookbooks, langchain)
6
+ to create vector stores using Chroma and LlamaIndex. It reads data from JSONL files, creates document embeddings,
7
+ and stores them in persistent Chroma databases for efficient retrieval.
8
+
9
+ Usage:
10
+ python data/scraping_scripts/create_vector_stores.py <source1> <source2> ...
11
+
12
+ Example:
13
+ python data/scraping_scripts/create_vector_stores.py transformers peft llama_index
14
+
15
+ The script accepts one or more source names as command-line arguments. Valid source names are:
16
+ transformers, peft, trl, llama_index, openai_cookbooks, langchain
17
+
18
+ For each specified source, the script will:
19
+ 1. Read data from the corresponding JSONL file
20
+ 2. Create document embeddings
21
+ 3. Store the embeddings in a Chroma vector database
22
+ 4. Save a dictionary of documents for future reference
23
+
24
+ Note: Ensure that the input JSONL files are present in the 'data' directory.
25
+ """
26
+
27
  import argparse
28
  import json
29
  import os
 
53
  "input_file": "data/openai_cookbooks_data.jsonl",
54
  "db_name": "chroma-db-openai_cookbooks",
55
  },
56
+ "langchain": {
57
+ "input_file": "data/langchain_data.jsonl",
58
+ "db_name": "chroma-db-langchain",
59
+ },
60
  }
61
 
62
 
data/scraping_scripts/github_to_markdown_ai_docs.py CHANGED
@@ -67,6 +67,11 @@ SOURCE_CONFIGS = {
67
  "repo": "openai-cookbook",
68
  "path": "examples",
69
  },
 
 
 
 
 
70
  }
71
 
72
  # GitHub Personal Access Token (replace with your own token)
 
67
  "repo": "openai-cookbook",
68
  "path": "examples",
69
  },
70
+ "langchain": {
71
+ "owner": "langchain-ai",
72
+ "repo": "langchain",
73
+ "path": "docs/docs",
74
+ },
75
  }
76
 
77
  # GitHub Personal Access Token (replace with your own token)
data/scraping_scripts/process_md_files.py CHANGED
@@ -110,6 +110,18 @@ SOURCE_CONFIGS = {
110
  "included_root_files": [],
111
  "url_extension": ".ipynb",
112
  },
 
 
 
 
 
 
 
 
 
 
 
 
113
  }
114
 
115
 
 
110
  "included_root_files": [],
111
  "url_extension": ".ipynb",
112
  },
113
+ "langchain": {
114
+ "base_url": "https://python.langchain.com/v0.2/docs/",
115
+ "input_directory": "data/langchain_md_files",
116
+ "output_file": "data/langchain_data.jsonl",
117
+ "source_name": "langchain",
118
+ "use_include_list": True,
119
+ "included_dirs": ["how_to", "versions", "tutorials", "integrations"],
120
+ "excluded_dirs": [],
121
+ "excluded_root_files": [],
122
+ "included_root_files": ["security.md", "concepts.mdx", "introduction.mdx"],
123
+ "url_extension": "",
124
+ },
125
  }
126
 
127
 
scripts/main.py CHANGED
@@ -10,6 +10,7 @@ from setup import (
10
  AVAILABLE_SOURCES,
11
  AVAILABLE_SOURCES_UI,
12
  CONCURRENCY_COUNT,
 
13
  custom_retriever_llama_index,
14
  custom_retriever_openai_cookbooks,
15
  custom_retriever_peft,
@@ -46,6 +47,11 @@ def update_query_engine_tools(selected_sources):
46
  "openai_cookbooks_info",
47
  """Useful for questions asking about accomplishing common tasks with the OpenAI API. Returns example code and guides stored in Jupyter notebooks, including info about ChatGPT GPT actions, OpenAI Assistants API, and How to fine-tune OpenAI's GPT-4o and GPT-4o-mini models with the OpenAI API.""",
48
  ),
 
 
 
 
 
49
  }
50
 
51
  for source in selected_sources:
 
10
  AVAILABLE_SOURCES,
11
  AVAILABLE_SOURCES_UI,
12
  CONCURRENCY_COUNT,
13
+ custom_retriever_langchain,
14
  custom_retriever_llama_index,
15
  custom_retriever_openai_cookbooks,
16
  custom_retriever_peft,
 
47
  "openai_cookbooks_info",
48
  """Useful for questions asking about accomplishing common tasks with the OpenAI API. Returns example code and guides stored in Jupyter notebooks, including info about ChatGPT GPT actions, OpenAI Assistants API, and How to fine-tune OpenAI's GPT-4o and GPT-4o-mini models with the OpenAI API.""",
49
  ),
50
+ "LangChain Docs": (
51
+ custom_retriever_langchain,
52
+ "langchain_info",
53
+ """Useful for questions asking about the LangChain framework. It is the documentation of the LangChain framework, includes info about building chains, agents, and tools, using memory, prompts, callbacks, etc.""",
54
+ ),
55
  }
56
 
57
  for source in selected_sources:
scripts/setup.py CHANGED
@@ -77,6 +77,10 @@ custom_retriever_openai_cookbooks = setup_database(
77
  "chroma-db-openai_cookbooks",
78
  "document_dict_openai_cookbooks.pkl",
79
  )
 
 
 
 
80
 
81
  # Constants
82
  CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
@@ -88,6 +92,7 @@ AVAILABLE_SOURCES_UI = [
88
  "TRL Docs",
89
  "LlamaIndex Docs",
90
  "OpenAI Cookbooks",
 
91
  # "Towards AI Blog",
92
  # "RAG Course",
93
  ]
@@ -98,6 +103,7 @@ AVAILABLE_SOURCES = [
98
  "trl",
99
  "llama_index",
100
  "openai_cookbooks",
 
101
  # "towards_ai_blog",
102
  # "rag_course",
103
  ]
@@ -114,6 +120,7 @@ __all__ = [
114
  "custom_retriever_trl",
115
  "custom_retriever_llama_index",
116
  "custom_retriever_openai_cookbooks",
 
117
  "CONCURRENCY_COUNT",
118
  "MONGODB_URI",
119
  "AVAILABLE_SOURCES_UI",
 
77
  "chroma-db-openai_cookbooks",
78
  "document_dict_openai_cookbooks.pkl",
79
  )
80
+ custom_retriever_langchain = setup_database(
81
+ "chroma-db-langchain",
82
+ "document_dict_langchain.pkl",
83
+ )
84
 
85
  # Constants
86
  CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
 
92
  "TRL Docs",
93
  "LlamaIndex Docs",
94
  "OpenAI Cookbooks",
95
+ "LangChain Docs",
96
  # "Towards AI Blog",
97
  # "RAG Course",
98
  ]
 
103
  "trl",
104
  "llama_index",
105
  "openai_cookbooks",
106
+ "langchain",
107
  # "towards_ai_blog",
108
  # "rag_course",
109
  ]
 
120
  "custom_retriever_trl",
121
  "custom_retriever_llama_index",
122
  "custom_retriever_openai_cookbooks",
123
+ "custom_retriever_langchain",
124
  "CONCURRENCY_COUNT",
125
  "MONGODB_URI",
126
  "AVAILABLE_SOURCES_UI",