svilens committed
Commit e6fe32a · Parent: ed4e4ea

added console logs

Files changed (6):
  1. app.py +3 -1
  2. doc_parser.py +6 -1
  3. generate_qa.py +13 -3
  4. requirements.txt +1 -0
  5. tts.py +7 -0
  6. utils.py +28 -0
app.py CHANGED
@@ -14,7 +14,7 @@ from generate_qa import (
     generate_outro,
 )
 from tts import speak, get_voices
-from utils import get_config, get_gender
+from utils import get_config, get_gender, setup_logger
 
 from bs4 import BeautifulSoup
 import gradio as gr
@@ -25,6 +25,7 @@ import uuid
 
 
 CONFIG = get_config()
+logger = setup_logger(__name__)
 
 
 def update_subcategory_dropdown(selection):
@@ -33,6 +34,7 @@ def update_subcategory_dropdown(selection):
 id_title_list = get_papers_by_category() # Global variable to store the list of papers, no time left to make this right
 
 def fetch_and_update_paper_list(category, subcategory):
+    logger.debug(f'Updating the papers list to {category} -> {subcategory}')
     global id_title_list
     subcategory_id = [key for key, value in arxiv_categories[category].items() if value == subcategory][0]
     id_title_list = get_papers_by_category(category=subcategory_id)
doc_parser.py CHANGED
@@ -3,10 +3,11 @@ from bs4 import BeautifulSoup
 import requests
 from collections import OrderedDict
 
-from utils import get_config
+from utils import get_config, setup_logger
 
 
 CONFIG = get_config()
+logger = setup_logger(__name__)
 
 
 def get_papers_by_category(
@@ -16,6 +17,7 @@ def get_papers_by_category(
     """
     Returns the IDs of the most recent papers from a specific category
     """
+    logger.debug(f'Searching for {max_results} papers in {category} category')
     search = arxiv.Search(
         query=f"cat:{category}",
         max_results=max_results,
@@ -30,6 +32,7 @@ def get_paper_by_id(id: str, local_dir: str = './', filename: str = 'temp.pdf'):
     """
     Downloads a paper by a given ID as PDF
     """
+    logger.debug(f'Downloading paper {id} to {local_dir}/{filename}')
     client = arxiv.Client()
     paper = next(client.results(arxiv.Search(id_list=[id])))
     paper.download_pdf(dirpath=local_dir, filename=filename)
@@ -41,6 +44,7 @@ def parse_pdf(
     file_path: str,
     endpoint: str = CONFIG.get("grobid").get("endpoint")
 ) -> str:
+    logger.debug('Parsing the PDF')
     f = {'input': open(file_path, 'rb')}
     response = requests.post(
         endpoint,
@@ -55,6 +59,7 @@ def parse_xml(soup_obj: BeautifulSoup) -> OrderedDict:
     and returns a dict with section titles as keys and content
     of paragraphs as values
     """
+    logger.debug('Parsing the XML sections')
     divs = soup_obj.find_all('div')
 
     content_dict = OrderedDict()
generate_qa.py CHANGED
@@ -3,15 +3,18 @@ from collections import OrderedDict
 from utils import (
     get_config,
     HuggingFaceLLM,
-    text_cleanup
+    text_cleanup,
+    setup_logger,
 )
 
 
 PROMPTS = get_config('prompts.yaml')
 llm = HuggingFaceLLM()
+logger = setup_logger(__name__)
 
 
 def generate_summary(sections_dict: OrderedDict, intro_only: bool = True):
+    logger.debug('Generating paper summary')
     global llm
     sections = list(sections_dict.keys())[:1] if intro_only else list(sections_dict.keys())
     for section_title in sections:
@@ -35,6 +38,7 @@ def generate_question(paragraph: str, last_answer: str):
         return
     if question.count('?') > 1:
         question = question.split('?')[0] + '?'
+    logger.debug(f"HOST: {question[:100]}...{question[-100:]}")
     return text_cleanup(question)
 
 
@@ -48,10 +52,12 @@ def generate_answer(paragraph: str, question: str, title: str):
         ) + paragraph,
         system_prompt=PROMPTS['guest']['sections']['system_prompt']
     )
+    logger.debug(f"GUEST: {answer[:100]}...{answer[-100:]}")
     return text_cleanup(answer)
 
 
 def process_qa_chain(sections_dict: OrderedDict, paper_title):
+    logger.debug('Generating Q&A pairs')
     section_keys = list(sections_dict.keys())[1:]
     answer = ''
     for section in section_keys:
@@ -60,14 +66,13 @@ def process_qa_chain(sections_dict: OrderedDict, paper_title):
         question = generate_question(paragraph, answer)
         if question:
             yield question
-            print(f"HOST: {question}")
         answer = generate_answer(paragraph, question, paper_title)
         if answer:
-            print(f"GUEST: {answer}")
             yield answer
 
 
 def generate_intro(author: str, title: str):
+    logger.debug('Generating podcast introduction')
     global llm
     host_intro = llm.call(
         system_prompt=PROMPTS['host']['intro']['system_prompt'],
@@ -76,6 +81,7 @@ def generate_intro(author: str, title: str):
             title=title
         )
     )
+    logger.debug(f"HOST: {host_intro[:100]}...{host_intro[-100:]}")
     yield text_cleanup(host_intro)
     guest_intro = llm.call(
         system_prompt=PROMPTS['guest']['intro']['system_prompt'].format(
@@ -86,6 +92,7 @@ def generate_intro(author: str, title: str):
             host_intro=host_intro
        )
     )
+    logger.debug(f"GUEST: {guest_intro[:100]}...{guest_intro[-100:]}")
     yield text_cleanup(guest_intro)
 
 
@@ -94,6 +101,7 @@ def generate_intro_summary_question():
 
 
 def generate_outro(author: str, title: str):
+    logger.debug('Generating podcast outro')
     global llm
     host_outro = llm.call(
         system_prompt=PROMPTS['host']['outro']['system_prompt'],
@@ -102,6 +110,7 @@ def generate_outro(author: str, title: str):
             title=title
         )
     )
+    logger.debug(f"HOST: {host_outro[:100]}...{host_outro[-100:]}")
     yield text_cleanup(host_outro)
     guest_outro = llm.call(
         system_prompt=PROMPTS['guest']['outro']['system_prompt'].format(
@@ -112,4 +121,5 @@ def generate_outro(author: str, title: str):
             host_outro=host_outro
         )
     )
+    logger.debug(f"GUEST: {guest_outro[:100]}...{guest_outro[-100:]}")
     yield text_cleanup(guest_outro)
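Note on the new debug lines in generate_qa.py: they log only the first and last 100 characters of each generated turn, which keeps long LLM outputs from flooding the console. One caveat is that when the text is shorter than 200 characters the two slices overlap and the same content is printed twice. A small helper along these lines (hypothetical, not part of this commit; the name truncate_for_log is illustrative) would avoid that:

# Hypothetical helper for log truncation, not in the repository.
def truncate_for_log(text: str, keep: int = 100) -> str:
    # Short strings pass through unchanged; long ones keep only head and tail.
    if len(text) <= 2 * keep:
        return text
    return f"{text[:keep]}...{text[-keep:]}"

# Possible usage with the module-level logger introduced in this commit:
# logger.debug(f"HOST: {truncate_for_log(question)}")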
requirements.txt CHANGED
@@ -10,3 +10,4 @@ lxml==5.3.0
 bs4==0.0.2
 arxiv==2.1.3
 gender-guesser==0.4.0
+colorlog==6.8.2
tts.py CHANGED
@@ -1,14 +1,20 @@
+from utils import setup_logger
+
 import asyncio
 import edge_tts
 from io import BytesIO
 
 
+logger = setup_logger(__name__)
+
+
 def speak(
     input_text: str,
     voice: str = 'en-US-AvaMultilingualNeural',
     output_duration: bool = False,
     correction_factor: float = 1.15 # Slightly increase the wait time (magic number with experimentation)
 ):
+    logger.debug('Speaking...')
     communicate = edge_tts.Communicate(input_text, voice)
     audio_stream = BytesIO()
 
@@ -28,6 +34,7 @@ def speak(
 
 
 def get_voices():
+    logger.debug('Getting a list of voices')
     voices = asyncio.run(edge_tts.list_voices())
     voices_by_gender = {
         'female': [],
utils.py CHANGED
@@ -1,5 +1,7 @@
+import colorlog
 from gender_guesser.detector import Detector
 from huggingface_hub import InferenceClient
+import logging
 import os
 import random
 import re
@@ -97,3 +99,29 @@ def get_gender(name: str):
     if gender not in ['male', 'female']:
         gender = random.choice(['male', 'female'])
     return gender
+
+
+def setup_logger(
+    name: str,
+    level: int = logging.DEBUG
+) -> logging.Logger:
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+    if not logger.hasHandlers():
+        stream_handler = logging.StreamHandler()
+        stream_handler.setLevel(level)
+        formatter = colorlog.ColoredFormatter(
+            "%(log_color)s%(asctime)s.%(msecs)03d - %(name)s - %(levelname)s - %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+            log_colors={
+                'DEBUG': 'cyan',
+                'INFO': 'green',
+                'WARNING': 'yellow',
+                'ERROR': 'red',
+                'CRITICAL': 'bold_red',
+            }
+        )
+        stream_handler.setFormatter(formatter)
+        logger.addHandler(stream_handler)
+        logger.propagate = False
+    return logger
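Taken together, the pattern this commit applies to app.py, doc_parser.py, generate_qa.py and tts.py is: import setup_logger from utils, create one module-level logger at import time, and emit a debug breadcrumb at the top of each function. A minimal sketch of that pattern (the module and message below are illustrative, not taken from the repo):

# example_module.py — illustrative only; mirrors how the changed modules use the helper
from utils import setup_logger

# One logger per module, so the %(name)s field in the colorlog format shows the origin.
logger = setup_logger(__name__)


def do_work(category: str, max_results: int = 10) -> None:
    # Debug-level breadcrumb, in the style of get_papers_by_category()
    logger.debug(f'Searching for {max_results} papers in {category} category')


if __name__ == '__main__':
    do_work('cs.CL')
    # Expected console line (cyan, since DEBUG maps to cyan), roughly:
    # 2024-01-01 12:00:00.000 - __main__ - DEBUG - Searching for 10 papers in cs.CL category

Two details of setup_logger worth noting: the if not logger.hasHandlers() guard prevents a module that ends up calling setup_logger more than once (for example on re-import in an interactive session) from stacking duplicate handlers, and logger.propagate = False stops records from also being emitted through the root logger, so each message is printed exactly once.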