Omar Solano committed on
Commit
e9199c3
1 Parent(s): bb3039a

add langchain docs (#39)

Browse files

* add langchain docs

* fix empty source completion

* reorder and rename sources

* bugfix update langchain source name

* new dataset name with renamed sources

* remove tmp file

Files changed (3) hide show
  1. app.py +20 -34
  2. cfg.py +1 -1
  3. data/tmp.py +0 -21
app.py CHANGED
@@ -18,36 +18,38 @@ from gradio.themes.utils import (
18
  import cfg
19
  from cfg import setup_buster
20
 
21
- buster = setup_buster(cfg.buster_cfg)
22
-
23
- # suppress httpx logs they are spammy and uninformative
24
- logging.getLogger("httpx").setLevel(logging.WARNING)
25
-
26
- logger = logging.getLogger(__name__)
27
- logging.basicConfig(level=logging.INFO)
28
-
29
  CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
30
 
31
  AVAILABLE_SOURCES_UI = [
32
- "Towards AI",
33
- "HuggingFace",
34
- "Wikipedia",
35
- "Gen AI 360: LangChain",
36
  "Gen AI 360: LLMs",
37
- "Activeloop",
38
- "Open AI",
 
 
 
 
 
39
  ]
40
 
41
  AVAILABLE_SOURCES = [
 
 
42
  "towards_ai",
 
43
  "hf_transformers",
44
  "wikipedia",
45
- "langchain_course",
46
- "llm_course",
47
- "activeloop",
48
  "openai",
 
49
  ]
50
 
 
 
 
 
 
 
 
 
51
 
52
  def log_likes(completion: Completion, like_data: gr.LikeData):
53
  # make it a str so json-parsable
@@ -92,24 +94,16 @@ def format_sources(matched_documents: pd.DataFrame) -> str:
92
  matched_documents.similarity_to_answer = (
93
  matched_documents.similarity_to_answer * 100
94
  )
95
-
96
- # matched_documents["repetition"] = matched_documents.groupby("title")[
97
- # "title"
98
- # ].transform("size")
99
-
100
- # drop duplicates, keep highest ranking ones
101
  matched_documents = matched_documents.sort_values(
102
  "similarity_to_answer", ascending=False
103
  ).drop_duplicates("title", keep="first")
104
 
105
- # Revert back to correct display
106
  display_source_to_ui = {
107
  ui: src for ui, src in zip(AVAILABLE_SOURCES, AVAILABLE_SOURCES_UI)
108
  }
109
  matched_documents["source"] = matched_documents["source"].replace(
110
  display_source_to_ui
111
  )
112
-
113
  documents = "\n".join(
114
  [
115
  document_template.format(document=document)
@@ -136,7 +130,7 @@ def user(user_input, history):
136
 
137
  def get_empty_source_completion(user_input):
138
  return Completion(
139
- user_input=user_input,
140
  answer_text="You have to select at least one source from the dropdown menu.",
141
  matched_documents=pd.DataFrame(),
142
  error=False,
@@ -166,15 +160,7 @@ def get_answer(history, sources: Optional[list[str]] = None):
166
  yield history, completion
167
 
168
 
169
- # CSS = """
170
- # .contain { display: flex; flex-direction: column; }
171
- # .gradio-container { height: 100vh !important; }
172
- # #component-0 { height: 100%; }
173
- # #chatbot { flex-grow: 1; overflow: auto;}
174
- # """
175
  theme = gr.themes.Soft()
176
- # theme.block_background_fill
177
- # demo = gr.Blocks(theme=theme)
178
  with gr.Blocks(
179
  theme=gr.themes.Soft(
180
  primary_hue="blue",
 
18
  import cfg
19
  from cfg import setup_buster
20
 
 
 
 
 
 
 
 
 
21
  CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
22
 
23
  AVAILABLE_SOURCES_UI = [
 
 
 
 
24
  "Gen AI 360: LLMs",
25
+ "Gen AI 360: LangChain",
26
+ "Towards AI Blog",
27
+ "Activeloop Docs",
28
+ "HF Transformers Docs",
29
+ "Wikipedia",
30
+ "OpenAI Docs",
31
+ "LangChain Docs",
32
  ]
33
 
34
  AVAILABLE_SOURCES = [
35
+ "llm_course",
36
+ "langchain_course",
37
  "towards_ai",
38
+ "activeloop",
39
  "hf_transformers",
40
  "wikipedia",
 
 
 
41
  "openai",
42
+ "langchain_docs",
43
  ]
44
 
45
+ buster = setup_buster(cfg.buster_cfg)
46
+
47
+ # suppress httpx logs they are spammy and uninformative
48
+ logging.getLogger("httpx").setLevel(logging.WARNING)
49
+
50
+ logger = logging.getLogger(__name__)
51
+ logging.basicConfig(level=logging.INFO)
52
+
53
 
54
  def log_likes(completion: Completion, like_data: gr.LikeData):
55
  # make it a str so json-parsable
 
94
  matched_documents.similarity_to_answer = (
95
  matched_documents.similarity_to_answer * 100
96
  )
 
 
 
 
 
 
97
  matched_documents = matched_documents.sort_values(
98
  "similarity_to_answer", ascending=False
99
  ).drop_duplicates("title", keep="first")
100
 
 
101
  display_source_to_ui = {
102
  ui: src for ui, src in zip(AVAILABLE_SOURCES, AVAILABLE_SOURCES_UI)
103
  }
104
  matched_documents["source"] = matched_documents["source"].replace(
105
  display_source_to_ui
106
  )
 
107
  documents = "\n".join(
108
  [
109
  document_template.format(document=document)
 
130
 
131
  def get_empty_source_completion(user_input):
132
  return Completion(
133
+ user_inputs=user_input,
134
  answer_text="You have to select at least one source from the dropdown menu.",
135
  matched_documents=pd.DataFrame(),
136
  error=False,
 
160
  yield history, completion
161
 
162
 
 
 
 
 
 
 
163
  theme = gr.themes.Soft()
 
 
164
  with gr.Blocks(
165
  theme=gr.themes.Soft(
166
  primary_hue="blue",
cfg.py CHANGED
@@ -23,7 +23,7 @@ ACTIVELOOP_TOKEN = os.getenv("ACTIVELOOP_TOKEN")
23
  if ACTIVELOOP_TOKEN is None:
24
  logger.warning("No activeloop token found, you will not be able to fetch data.")
25
 
26
- DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
27
  DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
28
 
29
  # if you want to use a local dataset, set the env. variable, it overrides all others
 
23
  if ACTIVELOOP_TOKEN is None:
24
  logger.warning("No activeloop token found, you will not be able to fetch data.")
25
 
26
+ DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset_debug")
27
  DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
28
 
29
  # if you want to use a local dataset, set the env. variable, it overrides all others
data/tmp.py DELETED
@@ -1,21 +0,0 @@
1
- # import pandas as pd
2
-
3
- # # Load the CSV
4
- # df = pd.read_csv('data/wiki.csv')
5
-
6
-
7
- # # Count the number of unique titles in the 'title' column
8
- # unique_titles_count = df['title']
9
-
10
- # print(len(df))
11
-
12
- # # # Remove the 'ranking' column
13
- # # df.drop('ranking', axis=1, inplace=True)
14
-
15
- # # # Save the CSV again
16
- # # df.to_csv('data/wiki.csv', index=False)
17
-
18
-
19
- import gradio as gr
20
-
21
- gr.themes.builder()