Léo Bourrel committed on
Commit
39a3f86
·
1 Parent(s): 8ee0a1b

feat: install pre-commit && clean

Browse files
.pre-commit-config.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_install_hook_types:
2
+ # Mandatory to install both pre-commit and pre-push hooks (see https://pre-commit.com/#top_level-default_install_hook_types)
3
+ # Add new hook types here to ensure automatic installation when running `pre-commit install`
4
+ - pre-commit
5
+ - pre-push
6
+ repos:
7
+ - repo: https://github.com/pre-commit/pre-commit-hooks
8
+ rev: v4.3.0
9
+ hooks:
10
+ - id: trailing-whitespace
11
+ - id: end-of-file-fixer
12
+ - id: check-yaml
13
+ - id: check-json
14
+ - id: check-added-large-files
15
+
16
+ - repo: https://github.com/srstevenson/nb-clean
17
+ rev: 3.0.0
18
+ hooks:
19
+ - id: nb-clean
20
+ args:
21
+ - --remove-empty-cells
22
+ - --preserve-cell-metadata
23
+ - --
24
+
25
+ # - repo: https://github.com/pre-commit/mirrors-mypy
26
+ # rev: 'v1.5.1'
27
+ # hooks:
28
+ # - id: mypy
29
+
30
+ - repo: local
31
+ hooks:
32
+ - id: black
33
+ name: Formatting (black)
34
+ entry: black
35
+ language: system
36
+ types: [python]
37
+ stages: [commit]
38
+ # - id: ruff
39
+ # name: Linter (ruff)
40
+ # entry: ruff
41
+ # language: system
42
+ # types: [python]
43
+ # stages: [commit]
44
+ # - id: test
45
+ # name: Unit tests (pytest)
46
+ # entry: make test
47
+ # pass_filenames: false
48
+ # language: system
49
+ # types: [python]
50
+ # stages: [push]
51
+ # - id: dvc-pre-push
52
+ # name: DVC pre-push
53
+ # entry: dvc
54
+ # args:
55
+ # - git-hook
56
+ # - pre-push
57
+ # require_serial: true
58
+ # verbose: true
59
+ # language: system
60
+ # stages: [push]
Dockerfile CHANGED
@@ -53,4 +53,4 @@ STOPSIGNAL SIGINT
53
 
54
  HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
55
 
56
- CMD ["postgres"]
 
53
 
54
  HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
55
 
56
+ CMD ["postgres"]
execution.sh CHANGED
@@ -4,4 +4,4 @@ bash /usr/local/bin/docker-entrypoint.sh "$@" &
4
  postgres &
5
  sleep 2
6
 
7
- streamlit run sorbobotapp/app.py --server.port=7860 --server.address=0.0.0.0
 
4
  postgres &
5
  sleep 2
6
 
7
+ streamlit run sorbobotapp/app.py --server.port=7860 --server.address=0.0.0.0
requirements.txt CHANGED
@@ -1,12 +1,14 @@
 
1
  gpt4all==1.0.12
2
  langchain==0.0.313
3
  openai==0.28.1
4
  pandas==2.1.1
5
  pgvector==0.2.3
 
6
  psycopg2-binary==2.9.9
7
  psycopg2==2.9.9
8
  streamlit==1.27.2
9
  streamlit-chat==0.1.1
10
  SQLAlchemy==2.0.22
11
  sqlite-vss==0.1.2
12
- tiktoken==0.5.1
 
1
+ black==23.11.0
2
  gpt4all==1.0.12
3
  langchain==0.0.313
4
  openai==0.28.1
5
  pandas==2.1.1
6
  pgvector==0.2.3
7
+ pre-commit==3.5.0
8
  psycopg2-binary==2.9.9
9
  psycopg2==2.9.9
10
  streamlit==1.27.2
11
  streamlit-chat==0.1.1
12
  SQLAlchemy==2.0.22
13
  sqlite-vss==0.1.2
14
+ tiktoken==0.5.1
setup.py CHANGED
@@ -8,4 +8,3 @@ setup(
8
  authors=["Leo Bourrel <leo@ia-lab.fr>"],
9
  package_dir={"": "sorbobotapp"},
10
  )
11
-
 
8
  authors=["Leo Bourrel <leo@ia-lab.fr>"],
9
  package_dir={"": "sorbobotapp"},
10
  )
 
sorbobotapp/app.py CHANGED
@@ -45,8 +45,12 @@ def send_message_callback():
45
  )
46
  st.session_state.token_count += cb.total_tokens
47
  if os.environ.get("ENVIRONMENT") == "dev":
48
- history_id = insert_chat_history(conn, human_prompt, llm_response["answer"])
49
- insert_chat_history_articles(conn, history_id, llm_response["source_documents"])
 
 
 
 
50
 
51
 
52
  def exemple_message_callback_button(args):
@@ -90,10 +94,10 @@ with chat_column:
90
 
91
  for chat in st.session_state.history:
92
  div = f"""
93
- <div class="chat-row
94
  {'' if chat.origin == 'ai' else 'row-reverse'}">
95
  <img class="chat-icon" src="https://cdn-icons-png.flaticon.com/512/{
96
- '1129/1129398.png' if chat.origin == 'ai'
97
  else '1077/1077012.png'}"
98
  width=32 height=32>
99
  <div class="chat-bubble
@@ -128,16 +132,18 @@ with chat_column:
128
  exemple,
129
  key=f"{idx_exemple}_button",
130
  on_click=exemple_message_callback_button,
131
- args=(exemple,)
132
  )
133
 
134
- st.button(":new: Start a new conversation", on_click=clear_history, type="secondary")
 
 
135
 
136
  if os.environ.get("ENVIRONMENT") == "dev":
137
  information_placeholder.caption(
138
  f"""
139
  Used {st.session_state.token_count} tokens \n
140
- Debug Langchain conversation:
141
  {st.session_state.history}
142
  """
143
  )
@@ -175,7 +181,9 @@ with doc_column:
175
  doc_metadata = doc.metadata
176
 
177
  expander = st.expander(doc_content["title"])
178
- expander.markdown(f"**HalID** : https://hal.science/{doc_metadata['hal_id']}")
 
 
179
  expander.markdown(doc_metadata["abstract"])
180
  expander.markdown(f"**Authors** : {doc_content['authors']}")
181
  expander.markdown(f"**Keywords** : {doc_content['keywords']}")
 
45
  )
46
  st.session_state.token_count += cb.total_tokens
47
  if os.environ.get("ENVIRONMENT") == "dev":
48
+ history_id = insert_chat_history(
49
+ conn, human_prompt, llm_response["answer"]
50
+ )
51
+ insert_chat_history_articles(
52
+ conn, history_id, llm_response["source_documents"]
53
+ )
54
 
55
 
56
  def exemple_message_callback_button(args):
 
94
 
95
  for chat in st.session_state.history:
96
  div = f"""
97
+ <div class="chat-row
98
  {'' if chat.origin == 'ai' else 'row-reverse'}">
99
  <img class="chat-icon" src="https://cdn-icons-png.flaticon.com/512/{
100
+ '1129/1129398.png' if chat.origin == 'ai'
101
  else '1077/1077012.png'}"
102
  width=32 height=32>
103
  <div class="chat-bubble
 
132
  exemple,
133
  key=f"{idx_exemple}_button",
134
  on_click=exemple_message_callback_button,
135
+ args=(exemple,),
136
  )
137
 
138
+ st.button(
139
+ ":new: Start a new conversation", on_click=clear_history, type="secondary"
140
+ )
141
 
142
  if os.environ.get("ENVIRONMENT") == "dev":
143
  information_placeholder.caption(
144
  f"""
145
  Used {st.session_state.token_count} tokens \n
146
+ Debug Langchain conversation:
147
  {st.session_state.history}
148
  """
149
  )
 
181
  doc_metadata = doc.metadata
182
 
183
  expander = st.expander(doc_content["title"])
184
+ expander.markdown(
185
+ f"**HalID** : https://hal.science/{doc_metadata['hal_id']}"
186
+ )
187
  expander.markdown(doc_metadata["abstract"])
188
  expander.markdown(f"**Authors** : {doc_content['authors']}")
189
  expander.markdown(f"**Keywords** : {doc_content['keywords']}")
sorbobotapp/conversation_retrieval_chain.py CHANGED
@@ -5,7 +5,9 @@ from typing import Any, Dict, Optional
5
  from keyword_extraction import KeywordExtractor
6
  from langchain.callbacks.manager import CallbackManagerForChainRun
7
  from langchain.chains.conversational_retrieval.base import (
8
- ConversationalRetrievalChain, _get_chat_history)
 
 
9
  from langchain.schema import Document
10
 
11
 
 
5
  from keyword_extraction import KeywordExtractor
6
  from langchain.callbacks.manager import CallbackManagerForChainRun
7
  from langchain.chains.conversational_retrieval.base import (
8
+ ConversationalRetrievalChain,
9
+ _get_chat_history,
10
+ )
11
  from langchain.schema import Document
12
 
13
 
sorbobotapp/static/styles.css CHANGED
@@ -33,4 +33,4 @@
33
 
34
  .chat-icon {
35
  border-radius: 5px;
36
- }
 
33
 
34
  .chat-icon {
35
  border-radius: 5px;
36
+ }
sorbobotapp/vector_store.py CHANGED
@@ -222,7 +222,7 @@ class CustomVectorStore(VectorStore):
222
  return self._results_to_docs_and_scores(results)
223
 
224
  @staticmethod
225
- def _fetch_title(title:str, abstract:str):
226
  if len(title) > 0:
227
  return title
228
  return abstract.split(".")[0]
@@ -234,7 +234,9 @@ class CustomVectorStore(VectorStore):
234
  Document(
235
  page_content=json.dumps(
236
  {
237
- "title": self._fetch_title(result["title"][0], result["abstract"][0]),
 
 
238
  "authors": result["authors"],
239
  "keywords": result["keywords"],
240
  }
@@ -271,14 +273,14 @@ class CustomVectorStore(VectorStore):
271
  a.doi,
272
  a.hal_id,
273
  a.abstract_en,
274
- string_agg(distinct keyword."name", ', ') as keywords,
275
  string_agg(distinct author."name", ', ') as authors,
276
  abstract_embedding_en {self.distance_strategy} '{str(embedding)}' as distance
277
  from article a
278
- left join article_keyword ON article_keyword.article_id = a.id
279
  left join keyword on article_keyword.keyword_id = keyword.id
280
  left join article_author ON article_author.article_id = a.id
281
- left join author on author.id = article_author.author_id
282
  where
283
  abstract_en != '' and
284
  abstract_en != 'None' and
 
222
  return self._results_to_docs_and_scores(results)
223
 
224
  @staticmethod
225
+ def _fetch_title(title: str, abstract: str):
226
  if len(title) > 0:
227
  return title
228
  return abstract.split(".")[0]
 
234
  Document(
235
  page_content=json.dumps(
236
  {
237
+ "title": self._fetch_title(
238
+ result["title"][0], result["abstract"][0]
239
+ ),
240
  "authors": result["authors"],
241
  "keywords": result["keywords"],
242
  }
 
273
  a.doi,
274
  a.hal_id,
275
  a.abstract_en,
276
+ string_agg(distinct keyword."name", ', ') as keywords,
277
  string_agg(distinct author."name", ', ') as authors,
278
  abstract_embedding_en {self.distance_strategy} '{str(embedding)}' as distance
279
  from article a
280
+ left join article_keyword ON article_keyword.article_id = a.id
281
  left join keyword on article_keyword.keyword_id = keyword.id
282
  left join article_author ON article_author.article_id = a.id
283
+ left join author on author.id = article_author.author_id
284
  where
285
  abstract_en != '' and
286
  abstract_en != 'None' and