Spaces:

paper-whisperer
/

paper-whisperer

Runtime error

App Files Community

svilens committed on Oct 28, 2024

Commit

e6fe32a

1 Parent(s): ed4e4ea

added console logs

Browse files

Files changed (6) hide show

app.py +3 -1
doc_parser.py +6 -1
generate_qa.py +13 -3
requirements.txt +1 -0
tts.py +7 -0
utils.py +28 -0

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ from generate_qa import (
     generate_outro,
 )
 from tts import speak, get_voices
-from utils import get_config, get_gender
 from bs4 import BeautifulSoup
 import gradio as gr
@@ -25,6 +25,7 @@ import uuid
 CONFIG = get_config()
 def update_subcategory_dropdown(selection):
@@ -33,6 +34,7 @@ def update_subcategory_dropdown(selection):
 id_title_list = get_papers_by_category()  # Global variable to store the list of papers, no time left to make this right
 def fetch_and_update_paper_list(category, subcategory):
     global id_title_list
     subcategory_id = [key for key, value in arxiv_categories[category].items() if value == subcategory][0]
     id_title_list = get_papers_by_category(category=subcategory_id)

     generate_outro,
 )
 from tts import speak, get_voices
+from utils import get_config, get_gender, setup_logger
 from bs4 import BeautifulSoup
 import gradio as gr
 CONFIG = get_config()
+logger = setup_logger(__name__)
 def update_subcategory_dropdown(selection):
 id_title_list = get_papers_by_category()  # Global variable to store the list of papers, no time left to make this right
 def fetch_and_update_paper_list(category, subcategory):
+    logger.debug(f'Updating the papers list to {category} -> {subcategory}')
     global id_title_list
     subcategory_id = [key for key, value in arxiv_categories[category].items() if value == subcategory][0]
     id_title_list = get_papers_by_category(category=subcategory_id)

doc_parser.py CHANGED Viewed

@@ -3,10 +3,11 @@ from bs4 import BeautifulSoup
 import requests
 from collections import OrderedDict
-from utils import get_config
 CONFIG = get_config()
 def get_papers_by_category(
@@ -16,6 +17,7 @@ def get_papers_by_category(
     """
     Returns the IDs of the most recent papers from a specific category
     """
     search = arxiv.Search(
         query=f"cat:{category}",
         max_results=max_results,
@@ -30,6 +32,7 @@ def get_paper_by_id(id: str, local_dir: str = './', filename: str = 'temp.pdf'):
     """
     Downloads a paper by a given ID as PDF
     """
     client = arxiv.Client()
     paper = next(client.results(arxiv.Search(id_list=[id])))
     paper.download_pdf(dirpath=local_dir, filename=filename)
@@ -41,6 +44,7 @@ def parse_pdf(
     file_path: str,
     endpoint: str = CONFIG.get("grobid").get("endpoint")
 ) -> str:
     f = {'input': open(file_path, 'rb')}
     response = requests.post(
         endpoint,
@@ -55,6 +59,7 @@ def parse_xml(soup_obj: BeautifulSoup) -> OrderedDict:
     and returns a dict with section titles as keys and content
     of paragraphs as values
     """
     divs = soup_obj.find_all('div')
     content_dict = OrderedDict()

 import requests
 from collections import OrderedDict
+from utils import get_config, setup_logger
 CONFIG = get_config()
+logger = setup_logger(__name__)
 def get_papers_by_category(
     """
     Returns the IDs of the most recent papers from a specific category
     """
+    logger.debug(f'Searching for {max_results} papers in {category} category')
     search = arxiv.Search(
         query=f"cat:{category}",
         max_results=max_results,
     """
     Downloads a paper by a given ID as PDF
     """
+    logger.debug(f'Downloading paper {id} to {local_dir}/{filename}')
     client = arxiv.Client()
     paper = next(client.results(arxiv.Search(id_list=[id])))
     paper.download_pdf(dirpath=local_dir, filename=filename)
     file_path: str,
     endpoint: str = CONFIG.get("grobid").get("endpoint")
 ) -> str:
+    logger.debug('Parsing the PDF')
     f = {'input': open(file_path, 'rb')}
     response = requests.post(
         endpoint,
     and returns a dict with section titles as keys and content
     of paragraphs as values
     """
+    logger.debug('Parsing the XML sections')
     divs = soup_obj.find_all('div')
     content_dict = OrderedDict()

generate_qa.py CHANGED Viewed

@@ -3,15 +3,18 @@ from collections import OrderedDict
 from utils import (
     get_config,
     HuggingFaceLLM,
-    text_cleanup
 )
 PROMPTS = get_config('prompts.yaml')
 llm = HuggingFaceLLM()
 def generate_summary(sections_dict: OrderedDict, intro_only: bool = True):
     global llm
     sections = list(sections_dict.keys())[:1] if intro_only else list(sections_dict.keys())
     for section_title in sections:
@@ -35,6 +38,7 @@ def generate_question(paragraph: str, last_answer: str):
         return
     if question.count('?') > 1:
         question = question.split('?')[0] + '?'
     return text_cleanup(question)
@@ -48,10 +52,12 @@ def generate_answer(paragraph: str, question: str, title: str):
         ) + paragraph,
         system_prompt=PROMPTS['guest']['sections']['system_prompt']
     )
     return text_cleanup(answer)
 def process_qa_chain(sections_dict: OrderedDict, paper_title):
     section_keys = list(sections_dict.keys())[1:]
     answer = ''
     for section in section_keys:
@@ -60,14 +66,13 @@ def process_qa_chain(sections_dict: OrderedDict, paper_title):
             question = generate_question(paragraph, answer)
             if question:
                 yield question
-                print(f"HOST: {question}")
                 answer = generate_answer(paragraph, question, paper_title)
                 if answer:
-                    print(f"GUEST: {answer}")
                     yield answer
 def generate_intro(author: str, title: str):
     global llm
     host_intro = llm.call(
         system_prompt=PROMPTS['host']['intro']['system_prompt'],
@@ -76,6 +81,7 @@ def generate_intro(author: str, title: str):
             title=title
         )
     )
     yield text_cleanup(host_intro)
     guest_intro = llm.call(
         system_prompt=PROMPTS['guest']['intro']['system_prompt'].format(
@@ -86,6 +92,7 @@ def generate_intro(author: str, title: str):
             host_intro=host_intro
         )
     )
     yield text_cleanup(guest_intro)
@@ -94,6 +101,7 @@ def generate_intro_summary_question():
 def generate_outro(author: str, title: str):
     global llm
     host_outro = llm.call(
         system_prompt=PROMPTS['host']['outro']['system_prompt'],
@@ -102,6 +110,7 @@ def generate_outro(author: str, title: str):
             title=title
         )
     )
     yield text_cleanup(host_outro)
     guest_outro = llm.call(
         system_prompt=PROMPTS['guest']['outro']['system_prompt'].format(
@@ -112,4 +121,5 @@ def generate_outro(author: str, title: str):
             host_outro=host_outro
         )
     )
     yield text_cleanup(guest_outro)

 from utils import (
     get_config,
     HuggingFaceLLM,
+    text_cleanup,
+    setup_logger,
 )
 PROMPTS = get_config('prompts.yaml')
 llm = HuggingFaceLLM()
+logger = setup_logger(__name__)
 def generate_summary(sections_dict: OrderedDict, intro_only: bool = True):
+    logger.debug('Generating paper summary')
     global llm
     sections = list(sections_dict.keys())[:1] if intro_only else list(sections_dict.keys())
     for section_title in sections:
         return
     if question.count('?') > 1:
         question = question.split('?')[0] + '?'
+    logger.debug(f"HOST: {question[:100]}...{question[-100:]}")
     return text_cleanup(question)
         ) + paragraph,
         system_prompt=PROMPTS['guest']['sections']['system_prompt']
     )
+    logger.debug(f"GUEST: {answer[:100]}...{answer[-100:]}")
     return text_cleanup(answer)
 def process_qa_chain(sections_dict: OrderedDict, paper_title):
+    logger.debug('Generating Q&A pairs')
     section_keys = list(sections_dict.keys())[1:]
     answer = ''
     for section in section_keys:
             question = generate_question(paragraph, answer)
             if question:
                 yield question
                 answer = generate_answer(paragraph, question, paper_title)
                 if answer:
                     yield answer
 def generate_intro(author: str, title: str):
+    logger.debug('Generating podcast introduction')
     global llm
     host_intro = llm.call(
         system_prompt=PROMPTS['host']['intro']['system_prompt'],
             title=title
         )
     )
+    logger.debug(f"HOST: {host_intro[:100]}...{host_intro[-100:]}")
     yield text_cleanup(host_intro)
     guest_intro = llm.call(
         system_prompt=PROMPTS['guest']['intro']['system_prompt'].format(
             host_intro=host_intro
         )
     )
+    logger.debug(f"GUEST: {guest_intro[:100]}...{guest_intro[-100:]}")
     yield text_cleanup(guest_intro)
 def generate_outro(author: str, title: str):
+    logger.debug('Generating podcast outro')
     global llm
     host_outro = llm.call(
         system_prompt=PROMPTS['host']['outro']['system_prompt'],
             title=title
         )
     )
+    logger.debug(f"HOST: {host_outro[:100]}...{host_outro[-100:]}")
     yield text_cleanup(host_outro)
     guest_outro = llm.call(
         system_prompt=PROMPTS['guest']['outro']['system_prompt'].format(
             host_outro=host_outro
         )
     )
+    logger.debug(f"GUEST: {guest_outro[:100]}...{guest_outro[-100:]}")
     yield text_cleanup(guest_outro)

requirements.txt CHANGED Viewed

@@ -10,3 +10,4 @@ lxml==5.3.0
 bs4==0.0.2
 arxiv==2.1.3
 gender-guesser==0.4.0

 bs4==0.0.2
 arxiv==2.1.3
 gender-guesser==0.4.0
+colorlog==6.8.2

tts.py CHANGED Viewed

@@ -1,14 +1,20 @@
 import asyncio
 import edge_tts
 from io import BytesIO
 def speak(
     input_text: str,
     voice: str = 'en-US-AvaMultilingualNeural',
     output_duration: bool = False,
     correction_factor: float = 1.15  # Slightly increase the wait time (magic number with experimentation)
 ):
     communicate = edge_tts.Communicate(input_text, voice)
     audio_stream = BytesIO()
@@ -28,6 +34,7 @@ def speak(
 def get_voices():
     voices = asyncio.run(edge_tts.list_voices())
     voices_by_gender = {
         'female': [],

+from utils import setup_logger
 import asyncio
 import edge_tts
 from io import BytesIO
+logger = setup_logger(__name__)
 def speak(
     input_text: str,
     voice: str = 'en-US-AvaMultilingualNeural',
     output_duration: bool = False,
     correction_factor: float = 1.15  # Slightly increase the wait time (magic number with experimentation)
 ):
+    logger.debug('Speaking...')
     communicate = edge_tts.Communicate(input_text, voice)
     audio_stream = BytesIO()
 def get_voices():
+    logger.debug('Getting a list of voices')
     voices = asyncio.run(edge_tts.list_voices())
     voices_by_gender = {
         'female': [],

utils.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from gender_guesser.detector import Detector
 from huggingface_hub import InferenceClient
 import os
 import random
 import re
@@ -97,3 +99,29 @@ def get_gender(name: str):
     if gender not in ['male', 'female']:
         gender = random.choice(['male', 'female'])
     return gender

+import colorlog
 from gender_guesser.detector import Detector
 from huggingface_hub import InferenceClient
+import logging
 import os
 import random
 import re
     if gender not in ['male', 'female']:
         gender = random.choice(['male', 'female'])
     return gender
+def setup_logger(
+    name: str,
+    level: int = logging.DEBUG
+) -> logging.Logger:
+    """
+    Returns a named logger that writes colorized records to the console
+
+    Both the logger and its stream handler are set to `level`. A handler is
+    attached only when the logger has none of its own, so calling this again
+    with the same name does not duplicate output. Propagation is switched off
+    so ancestor loggers do not emit the same record a second time.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+    # Inspect this logger's own handlers instead of hasHandlers(): the latter
+    # also searches ancestor loggers, so a handler on the root logger would
+    # skip the setup below and, with propagation off, DEBUG/INFO records from
+    # this logger would be silently dropped.
+    if not logger.handlers:
+        stream_handler = logging.StreamHandler()
+        stream_handler.setLevel(level)
+        formatter = colorlog.ColoredFormatter(
+            "%(log_color)s%(asctime)s.%(msecs)03d - %(name)s - %(levelname)s - %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+            log_colors={
+                'DEBUG': 'cyan',
+                'INFO': 'green',
+                'WARNING': 'yellow',
+                'ERROR': 'red',
+                'CRITICAL': 'bold_red',
+            }
+        )
+        stream_handler.setFormatter(formatter)
+        logger.addHandler(stream_handler)
+    logger.propagate = False
+    return logger