barunsaha committed on
Commit
abd7b16
1 Parent(s): 7a2333b

Update JSON cleaning logic and logs

Browse files
Files changed (1) hide show
  1. app.py +69 -43
app.py CHANGED
@@ -12,7 +12,7 @@ from langchain_community.chat_message_histories import (
12
  )
13
  from langchain_core.messages import HumanMessage
14
  from langchain_core.prompts import ChatPromptTemplate
15
- from transformers import AutoTokenizer
16
 
17
  from global_config import GlobalConfig
18
  from helpers import llm_helper, pptx_helper
@@ -48,17 +48,17 @@ def _get_prompt_template(is_refinement: bool) -> str:
48
  return template
49
 
50
 
51
- @st.cache_resource
52
- def _get_tokenizer() -> AutoTokenizer:
53
- """
54
- Get Mistral tokenizer for counting tokens.
55
-
56
- :return: The tokenizer.
57
- """
58
-
59
- return AutoTokenizer.from_pretrained(
60
- pretrained_model_name_or_path=GlobalConfig.HF_LLM_MODEL_NAME
61
- )
62
 
63
 
64
  APP_TEXT = _load_strings()
@@ -139,10 +139,8 @@ def set_up_chat_ui():
139
 
140
  if _is_it_refinement():
141
  template = _get_prompt_template(is_refinement=True)
142
- logger.debug('Getting refinement template')
143
  else:
144
  template = _get_prompt_template(is_refinement=False)
145
- logger.debug('Getting initial template')
146
 
147
  prompt_template = ChatPromptTemplate.from_template(template)
148
 
@@ -215,14 +213,14 @@ def set_up_chat_ui():
215
  history.add_user_message(prompt)
216
  history.add_ai_message(response)
217
 
218
- if GlobalConfig.COUNT_TOKENS:
219
- tokenizer = _get_tokenizer()
220
- tokens_count_in = len(tokenizer.tokenize(formatted_template))
221
- tokens_count_out = len(tokenizer.tokenize(response))
222
- logger.debug(
223
- 'Tokens count:: input: %d, output: %d',
224
- tokens_count_in, tokens_count_out
225
- )
226
 
227
  # _display_messages_history(view_messages)
228
 
@@ -237,6 +235,11 @@ def set_up_chat_ui():
237
  generate_slide_deck(response_cleaned)
238
  progress_bar_pptx.progress(100, text='Done!')
239
 
 
 
 
 
 
240
 
241
  def generate_slide_deck(json_str: str):
242
  """
@@ -247,12 +250,10 @@ def generate_slide_deck(json_str: str):
247
 
248
  if DOWNLOAD_FILE_KEY in st.session_state:
249
  path = pathlib.Path(st.session_state[DOWNLOAD_FILE_KEY])
250
- logger.debug('DOWNLOAD_FILE_KEY found in session')
251
  else:
252
  temp = tempfile.NamedTemporaryFile(delete=False, suffix='.pptx')
253
  path = pathlib.Path(temp.name)
254
  st.session_state[DOWNLOAD_FILE_KEY] = str(path)
255
- logger.debug('DOWNLOAD_FILE_KEY not found in session')
256
 
257
  if temp:
258
  temp.close()
@@ -268,7 +269,15 @@ def generate_slide_deck(json_str: str):
268
 
269
  _display_download_button(path)
270
  except ValueError as ve:
271
- st.error(APP_TEXT['json_parsing_error'])
 
 
 
 
 
 
 
 
272
  logger.error('%s', APP_TEXT['json_parsing_error'])
273
  logger.error('Additional error info: %s', str(ve))
274
  except Exception as ex:
@@ -347,34 +356,51 @@ def _display_messages_history(view_messages: st.expander):
347
  def _clean_json(json_str: str) -> str:
348
  """
349
  Attempt to clean a JSON response string from the LLM by removing the trailing ```
350
- and any text beyond that. May not be always accurate.
 
351
 
352
  :param json_str: The input string in JSON format.
353
  :return: The "cleaned" JSON string.
354
  """
355
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  str_len = len(json_str)
357
  response_cleaned = json_str
358
 
359
- try:
360
- idx = json_str.rindex('```')
361
- logger.debug(
362
- 'Fixing JSON response: str_len: %d, idx of ```: %d',
363
- str_len, idx
364
- )
 
 
 
 
365
 
366
- if idx + 3 == str_len:
367
- # The response ends with ``` -- most likely the end of JSON response string
368
  response_cleaned = json_str[:idx]
369
- elif idx + 3 < str_len:
370
- # Looks like there are some more content beyond the last ```
371
- # In the best case, it would be some additional plain-text response from the LLM
372
- # and is unlikely to contain } or ] that are present in JSON
373
- if '}' not in json_str[idx + 3:]: # the remainder of the text
374
- response_cleaned = json_str[:idx]
375
- except ValueError:
376
- # No ``` found
377
- pass
 
378
 
379
  return response_cleaned
380
 
 
12
  )
13
  from langchain_core.messages import HumanMessage
14
  from langchain_core.prompts import ChatPromptTemplate
15
+ # from transformers import AutoTokenizer
16
 
17
  from global_config import GlobalConfig
18
  from helpers import llm_helper, pptx_helper
 
48
  return template
49
 
50
 
51
+ # @st.cache_resource
52
+ # def _get_tokenizer() -> AutoTokenizer:
53
+ # """
54
+ # Get Mistral tokenizer for counting tokens.
55
+ #
56
+ # :return: The tokenizer.
57
+ # """
58
+ #
59
+ # return AutoTokenizer.from_pretrained(
60
+ # pretrained_model_name_or_path=GlobalConfig.HF_LLM_MODEL_NAME
61
+ # )
62
 
63
 
64
  APP_TEXT = _load_strings()
 
139
 
140
  if _is_it_refinement():
141
  template = _get_prompt_template(is_refinement=True)
 
142
  else:
143
  template = _get_prompt_template(is_refinement=False)
 
144
 
145
  prompt_template = ChatPromptTemplate.from_template(template)
146
 
 
213
  history.add_user_message(prompt)
214
  history.add_ai_message(response)
215
 
216
+ # if GlobalConfig.COUNT_TOKENS:
217
+ # tokenizer = _get_tokenizer()
218
+ # tokens_count_in = len(tokenizer.tokenize(formatted_template))
219
+ # tokens_count_out = len(tokenizer.tokenize(response))
220
+ # logger.debug(
221
+ # 'Tokens count:: input: %d, output: %d',
222
+ # tokens_count_in, tokens_count_out
223
+ # )
224
 
225
  # _display_messages_history(view_messages)
226
 
 
235
  generate_slide_deck(response_cleaned)
236
  progress_bar_pptx.progress(100, text='Done!')
237
 
238
+ logger.info(
239
+ '#messages in history / 2: %d',
240
+ len(st.session_state[CHAT_MESSAGES]) / 2
241
+ )
242
+
243
 
244
  def generate_slide_deck(json_str: str):
245
  """
 
250
 
251
  if DOWNLOAD_FILE_KEY in st.session_state:
252
  path = pathlib.Path(st.session_state[DOWNLOAD_FILE_KEY])
 
253
  else:
254
  temp = tempfile.NamedTemporaryFile(delete=False, suffix='.pptx')
255
  path = pathlib.Path(temp.name)
256
  st.session_state[DOWNLOAD_FILE_KEY] = str(path)
 
257
 
258
  if temp:
259
  temp.close()
 
269
 
270
  _display_download_button(path)
271
  except ValueError as ve:
272
+ st.error(
273
+ f"{APP_TEXT['json_parsing_error']}"
274
+ f"\n\nAdditional error info: {ve}"
275
+ f"\n\nHere are some sample instructions that you could try to possibly fix this error;"
276
+ f"if these don't work, try rephrasing or refreshing:"
277
+ f"\n\n"
278
+ "- Regenerate content and fix the JSON error."
279
+ "\n- Regenerate content and fix the JSON error. Quotes inside quotes should be escaped."
280
+ )
281
  logger.error('%s', APP_TEXT['json_parsing_error'])
282
  logger.error('Additional error info: %s', str(ve))
283
  except Exception as ex:
 
356
def _clean_json(json_str: str) -> str:
    """
    Attempt to clean a JSON response string from the LLM by removing the trailing
    ``` fence and any text beyond it.

    CAUTION: May not be always accurate.

    An example of a response containing JSON followed by other text::

        {
          "title": "AI and the Future: A Transformative Journey",
          "slides": [
            ...
          ]
        }   <<---- This is the end of the valid JSON content
        ```

        ```vbnet
        Please note that the JSON output is in valid format but the content ...
        ```

    The scan walks backwards over every ``` fence: whenever a fence is directly
    preceded by a closing bracket ``}`` (optionally with a newline in between),
    everything from that fence onward is discarded.

    :param json_str: The input string in JSON format.
    :return: The "cleaned" JSON string.
    """

    str_len = len(json_str)
    response_cleaned = json_str

    while True:
        idx = json_str.rfind('```')  # -1 when no fence remains

        if idx <= 0:
            break

        # In the ideal scenario, the character before the fence is either a
        # closing bracket } or a newline that immediately follows one
        prev_char = json_str[idx - 1]
        logger.debug('Fence found:: idx: %d, prev_char: %r', idx, prev_char)

        if prev_char == '}':
            response_cleaned = json_str[:idx]
        elif prev_char == '\n' and idx >= 2 and json_str[idx - 2] == '}':
            # The idx >= 2 guard prevents json_str[idx - 2] from wrapping
            # around to the end of the string (negative indexing) when the
            # fence sits at idx == 1
            response_cleaned = json_str[:idx]

        # Drop this fence and everything after it, then scan for earlier fences
        json_str = json_str[:idx]

    logger.info(
        'Cleaning JSON response:: original length: %d | cleaned length: %d',
        str_len, len(response_cleaned)
    )
    logger.debug('Cleaned JSON: %s', response_cleaned)

    return response_cleaned
406