Spaces:
Runtime error
Runtime error
added console logs
Browse files
- app.py +3 -1
- doc_parser.py +6 -1
- generate_qa.py +13 -3
- requirements.txt +1 -0
- tts.py +7 -0
- utils.py +28 -0
app.py
CHANGED
|
@@ -14,7 +14,7 @@ from generate_qa import (
|
|
| 14 |
generate_outro,
|
| 15 |
)
|
| 16 |
from tts import speak, get_voices
|
| 17 |
-
from utils import get_config, get_gender
|
| 18 |
|
| 19 |
from bs4 import BeautifulSoup
|
| 20 |
import gradio as gr
|
|
@@ -25,6 +25,7 @@ import uuid
|
|
| 25 |
|
| 26 |
|
| 27 |
CONFIG = get_config()
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def update_subcategory_dropdown(selection):
|
|
@@ -33,6 +34,7 @@ def update_subcategory_dropdown(selection):
|
|
| 33 |
id_title_list = get_papers_by_category() # Global variable to store the list of papers, no time left to make this right
|
| 34 |
|
| 35 |
def fetch_and_update_paper_list(category, subcategory):
|
|
|
|
| 36 |
global id_title_list
|
| 37 |
subcategory_id = [key for key, value in arxiv_categories[category].items() if value == subcategory][0]
|
| 38 |
id_title_list = get_papers_by_category(category=subcategory_id)
|
|
|
|
| 14 |
generate_outro,
|
| 15 |
)
|
| 16 |
from tts import speak, get_voices
|
| 17 |
+
from utils import get_config, get_gender, setup_logger
|
| 18 |
|
| 19 |
from bs4 import BeautifulSoup
|
| 20 |
import gradio as gr
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
CONFIG = get_config()
|
| 28 |
+
logger = setup_logger(__name__)
|
| 29 |
|
| 30 |
|
| 31 |
def update_subcategory_dropdown(selection):
|
|
|
|
| 34 |
id_title_list = get_papers_by_category() # Global variable to store the list of papers, no time left to make this right
|
| 35 |
|
| 36 |
def fetch_and_update_paper_list(category, subcategory):
|
| 37 |
+
logger.debug(f'Updating the papers list to {category} -> {subcategory}')
|
| 38 |
global id_title_list
|
| 39 |
subcategory_id = [key for key, value in arxiv_categories[category].items() if value == subcategory][0]
|
| 40 |
id_title_list = get_papers_by_category(category=subcategory_id)
|
doc_parser.py
CHANGED
|
@@ -3,10 +3,11 @@ from bs4 import BeautifulSoup
|
|
| 3 |
import requests
|
| 4 |
from collections import OrderedDict
|
| 5 |
|
| 6 |
-
from utils import get_config
|
| 7 |
|
| 8 |
|
| 9 |
CONFIG = get_config()
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def get_papers_by_category(
|
|
@@ -16,6 +17,7 @@ def get_papers_by_category(
|
|
| 16 |
"""
|
| 17 |
Returns the IDs of the most recent papers from a specific category
|
| 18 |
"""
|
|
|
|
| 19 |
search = arxiv.Search(
|
| 20 |
query=f"cat:{category}",
|
| 21 |
max_results=max_results,
|
|
@@ -30,6 +32,7 @@ def get_paper_by_id(id: str, local_dir: str = './', filename: str = 'temp.pdf'):
|
|
| 30 |
"""
|
| 31 |
Downloads a paper by a given ID as PDF
|
| 32 |
"""
|
|
|
|
| 33 |
client = arxiv.Client()
|
| 34 |
paper = next(client.results(arxiv.Search(id_list=[id])))
|
| 35 |
paper.download_pdf(dirpath=local_dir, filename=filename)
|
|
@@ -41,6 +44,7 @@ def parse_pdf(
|
|
| 41 |
file_path: str,
|
| 42 |
endpoint: str = CONFIG.get("grobid").get("endpoint")
|
| 43 |
) -> str:
|
|
|
|
| 44 |
f = {'input': open(file_path, 'rb')}
|
| 45 |
response = requests.post(
|
| 46 |
endpoint,
|
|
@@ -55,6 +59,7 @@ def parse_xml(soup_obj: BeautifulSoup) -> OrderedDict:
|
|
| 55 |
and returns a dict with section titles as keys and content
|
| 56 |
of paragraphs as values
|
| 57 |
"""
|
|
|
|
| 58 |
divs = soup_obj.find_all('div')
|
| 59 |
|
| 60 |
content_dict = OrderedDict()
|
|
|
|
| 3 |
import requests
|
| 4 |
from collections import OrderedDict
|
| 5 |
|
| 6 |
+
from utils import get_config, setup_logger
|
| 7 |
|
| 8 |
|
| 9 |
CONFIG = get_config()
|
| 10 |
+
logger = setup_logger(__name__)
|
| 11 |
|
| 12 |
|
| 13 |
def get_papers_by_category(
|
|
|
|
| 17 |
"""
|
| 18 |
Returns the IDs of the most recent papers from a specific category
|
| 19 |
"""
|
| 20 |
+
logger.debug(f'Searching for {max_results} papers in {category} category')
|
| 21 |
search = arxiv.Search(
|
| 22 |
query=f"cat:{category}",
|
| 23 |
max_results=max_results,
|
|
|
|
| 32 |
"""
|
| 33 |
Downloads a paper by a given ID as PDF
|
| 34 |
"""
|
| 35 |
+
logger.debug(f'Downloading paper {id} to {local_dir}/{filename}')
|
| 36 |
client = arxiv.Client()
|
| 37 |
paper = next(client.results(arxiv.Search(id_list=[id])))
|
| 38 |
paper.download_pdf(dirpath=local_dir, filename=filename)
|
|
|
|
| 44 |
file_path: str,
|
| 45 |
endpoint: str = CONFIG.get("grobid").get("endpoint")
|
| 46 |
) -> str:
|
| 47 |
+
logger.debug('Parsing the PDF')
|
| 48 |
f = {'input': open(file_path, 'rb')}
|
| 49 |
response = requests.post(
|
| 50 |
endpoint,
|
|
|
|
| 59 |
and returns a dict with section titles as keys and content
|
| 60 |
of paragraphs as values
|
| 61 |
"""
|
| 62 |
+
logger.debug('Parsing the XML sections')
|
| 63 |
divs = soup_obj.find_all('div')
|
| 64 |
|
| 65 |
content_dict = OrderedDict()
|
generate_qa.py
CHANGED
|
@@ -3,15 +3,18 @@ from collections import OrderedDict
|
|
| 3 |
from utils import (
|
| 4 |
get_config,
|
| 5 |
HuggingFaceLLM,
|
| 6 |
-
text_cleanup
|
|
|
|
| 7 |
)
|
| 8 |
|
| 9 |
|
| 10 |
PROMPTS = get_config('prompts.yaml')
|
| 11 |
llm = HuggingFaceLLM()
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
def generate_summary(sections_dict: OrderedDict, intro_only: bool = True):
|
|
|
|
| 15 |
global llm
|
| 16 |
sections = list(sections_dict.keys())[:1] if intro_only else list(sections_dict.keys())
|
| 17 |
for section_title in sections:
|
|
@@ -35,6 +38,7 @@ def generate_question(paragraph: str, last_answer: str):
|
|
| 35 |
return
|
| 36 |
if question.count('?') > 1:
|
| 37 |
question = question.split('?')[0] + '?'
|
|
|
|
| 38 |
return text_cleanup(question)
|
| 39 |
|
| 40 |
|
|
@@ -48,10 +52,12 @@ def generate_answer(paragraph: str, question: str, title: str):
|
|
| 48 |
) + paragraph,
|
| 49 |
system_prompt=PROMPTS['guest']['sections']['system_prompt']
|
| 50 |
)
|
|
|
|
| 51 |
return text_cleanup(answer)
|
| 52 |
|
| 53 |
|
| 54 |
def process_qa_chain(sections_dict: OrderedDict, paper_title):
|
|
|
|
| 55 |
section_keys = list(sections_dict.keys())[1:]
|
| 56 |
answer = ''
|
| 57 |
for section in section_keys:
|
|
@@ -60,14 +66,13 @@ def process_qa_chain(sections_dict: OrderedDict, paper_title):
|
|
| 60 |
question = generate_question(paragraph, answer)
|
| 61 |
if question:
|
| 62 |
yield question
|
| 63 |
-
print(f"HOST: {question}")
|
| 64 |
answer = generate_answer(paragraph, question, paper_title)
|
| 65 |
if answer:
|
| 66 |
-
print(f"GUEST: {answer}")
|
| 67 |
yield answer
|
| 68 |
|
| 69 |
|
| 70 |
def generate_intro(author: str, title: str):
|
|
|
|
| 71 |
global llm
|
| 72 |
host_intro = llm.call(
|
| 73 |
system_prompt=PROMPTS['host']['intro']['system_prompt'],
|
|
@@ -76,6 +81,7 @@ def generate_intro(author: str, title: str):
|
|
| 76 |
title=title
|
| 77 |
)
|
| 78 |
)
|
|
|
|
| 79 |
yield text_cleanup(host_intro)
|
| 80 |
guest_intro = llm.call(
|
| 81 |
system_prompt=PROMPTS['guest']['intro']['system_prompt'].format(
|
|
@@ -86,6 +92,7 @@ def generate_intro(author: str, title: str):
|
|
| 86 |
host_intro=host_intro
|
| 87 |
)
|
| 88 |
)
|
|
|
|
| 89 |
yield text_cleanup(guest_intro)
|
| 90 |
|
| 91 |
|
|
@@ -94,6 +101,7 @@ def generate_intro_summary_question():
|
|
| 94 |
|
| 95 |
|
| 96 |
def generate_outro(author: str, title: str):
|
|
|
|
| 97 |
global llm
|
| 98 |
host_outro = llm.call(
|
| 99 |
system_prompt=PROMPTS['host']['outro']['system_prompt'],
|
|
@@ -102,6 +110,7 @@ def generate_outro(author: str, title: str):
|
|
| 102 |
title=title
|
| 103 |
)
|
| 104 |
)
|
|
|
|
| 105 |
yield text_cleanup(host_outro)
|
| 106 |
guest_outro = llm.call(
|
| 107 |
system_prompt=PROMPTS['guest']['outro']['system_prompt'].format(
|
|
@@ -112,4 +121,5 @@ def generate_outro(author: str, title: str):
|
|
| 112 |
host_outro=host_outro
|
| 113 |
)
|
| 114 |
)
|
|
|
|
| 115 |
yield text_cleanup(guest_outro)
|
|
|
|
| 3 |
from utils import (
|
| 4 |
get_config,
|
| 5 |
HuggingFaceLLM,
|
| 6 |
+
text_cleanup,
|
| 7 |
+
setup_logger,
|
| 8 |
)
|
| 9 |
|
| 10 |
|
| 11 |
PROMPTS = get_config('prompts.yaml')
|
| 12 |
llm = HuggingFaceLLM()
|
| 13 |
+
logger = setup_logger(__name__)
|
| 14 |
|
| 15 |
|
| 16 |
def generate_summary(sections_dict: OrderedDict, intro_only: bool = True):
|
| 17 |
+
logger.debug('Generating paper summary')
|
| 18 |
global llm
|
| 19 |
sections = list(sections_dict.keys())[:1] if intro_only else list(sections_dict.keys())
|
| 20 |
for section_title in sections:
|
|
|
|
| 38 |
return
|
| 39 |
if question.count('?') > 1:
|
| 40 |
question = question.split('?')[0] + '?'
|
| 41 |
+
logger.debug(f"HOST: {question[:100]}...{question[-100:]}")
|
| 42 |
return text_cleanup(question)
|
| 43 |
|
| 44 |
|
|
|
|
| 52 |
) + paragraph,
|
| 53 |
system_prompt=PROMPTS['guest']['sections']['system_prompt']
|
| 54 |
)
|
| 55 |
+
logger.debug(f"GUEST: {answer[:100]}...{answer[-100:]}")
|
| 56 |
return text_cleanup(answer)
|
| 57 |
|
| 58 |
|
| 59 |
def process_qa_chain(sections_dict: OrderedDict, paper_title):
|
| 60 |
+
logger.debug('Generating Q&A pairs')
|
| 61 |
section_keys = list(sections_dict.keys())[1:]
|
| 62 |
answer = ''
|
| 63 |
for section in section_keys:
|
|
|
|
| 66 |
question = generate_question(paragraph, answer)
|
| 67 |
if question:
|
| 68 |
yield question
|
|
|
|
| 69 |
answer = generate_answer(paragraph, question, paper_title)
|
| 70 |
if answer:
|
|
|
|
| 71 |
yield answer
|
| 72 |
|
| 73 |
|
| 74 |
def generate_intro(author: str, title: str):
|
| 75 |
+
logger.debug('Generating podcast introduction')
|
| 76 |
global llm
|
| 77 |
host_intro = llm.call(
|
| 78 |
system_prompt=PROMPTS['host']['intro']['system_prompt'],
|
|
|
|
| 81 |
title=title
|
| 82 |
)
|
| 83 |
)
|
| 84 |
+
logger.debug(f"HOST: {host_intro[:100]}...{host_intro[-100:]}")
|
| 85 |
yield text_cleanup(host_intro)
|
| 86 |
guest_intro = llm.call(
|
| 87 |
system_prompt=PROMPTS['guest']['intro']['system_prompt'].format(
|
|
|
|
| 92 |
host_intro=host_intro
|
| 93 |
)
|
| 94 |
)
|
| 95 |
+
logger.debug(f"GUEST: {guest_intro[:100]}...{guest_intro[-100:]}")
|
| 96 |
yield text_cleanup(guest_intro)
|
| 97 |
|
| 98 |
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
def generate_outro(author: str, title: str):
|
| 104 |
+
logger.debug('Generating podcast outro')
|
| 105 |
global llm
|
| 106 |
host_outro = llm.call(
|
| 107 |
system_prompt=PROMPTS['host']['outro']['system_prompt'],
|
|
|
|
| 110 |
title=title
|
| 111 |
)
|
| 112 |
)
|
| 113 |
+
logger.debug(f"HOST: {host_outro[:100]}...{host_outro[-100:]}")
|
| 114 |
yield text_cleanup(host_outro)
|
| 115 |
guest_outro = llm.call(
|
| 116 |
system_prompt=PROMPTS['guest']['outro']['system_prompt'].format(
|
|
|
|
| 121 |
host_outro=host_outro
|
| 122 |
)
|
| 123 |
)
|
| 124 |
+
logger.debug(f"GUEST: {guest_outro[:100]}...{guest_outro[-100:]}")
|
| 125 |
yield text_cleanup(guest_outro)
|
requirements.txt
CHANGED
|
@@ -10,3 +10,4 @@ lxml==5.3.0
|
|
| 10 |
bs4==0.0.2
|
| 11 |
arxiv==2.1.3
|
| 12 |
gender-guesser==0.4.0
|
|
|
|
|
|
| 10 |
bs4==0.0.2
|
| 11 |
arxiv==2.1.3
|
| 12 |
gender-guesser==0.4.0
|
| 13 |
+
colorlog==6.8.2
|
tts.py
CHANGED
|
@@ -1,14 +1,20 @@
|
|
|
|
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import edge_tts
|
| 3 |
from io import BytesIO
|
| 4 |
|
| 5 |
|
|
|
|
|
|
|
|
|
|
| 6 |
def speak(
|
| 7 |
input_text: str,
|
| 8 |
voice: str = 'en-US-AvaMultilingualNeural',
|
| 9 |
output_duration: bool = False,
|
| 10 |
correction_factor: float = 1.15 # Slightly increase the wait time (magic number with experimentation)
|
| 11 |
):
|
|
|
|
| 12 |
communicate = edge_tts.Communicate(input_text, voice)
|
| 13 |
audio_stream = BytesIO()
|
| 14 |
|
|
@@ -28,6 +34,7 @@ def speak(
|
|
| 28 |
|
| 29 |
|
| 30 |
def get_voices():
|
|
|
|
| 31 |
voices = asyncio.run(edge_tts.list_voices())
|
| 32 |
voices_by_gender = {
|
| 33 |
'female': [],
|
|
|
|
| 1 |
+
from utils import setup_logger
|
| 2 |
+
|
| 3 |
import asyncio
|
| 4 |
import edge_tts
|
| 5 |
from io import BytesIO
|
| 6 |
|
| 7 |
|
| 8 |
+
logger = setup_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
def speak(
|
| 12 |
input_text: str,
|
| 13 |
voice: str = 'en-US-AvaMultilingualNeural',
|
| 14 |
output_duration: bool = False,
|
| 15 |
correction_factor: float = 1.15 # Slightly increase the wait time (magic number with experimentation)
|
| 16 |
):
|
| 17 |
+
logger.debug('Speaking...')
|
| 18 |
communicate = edge_tts.Communicate(input_text, voice)
|
| 19 |
audio_stream = BytesIO()
|
| 20 |
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
def get_voices():
|
| 37 |
+
logger.debug('Getting a list of voices')
|
| 38 |
voices = asyncio.run(edge_tts.list_voices())
|
| 39 |
voices_by_gender = {
|
| 40 |
'female': [],
|
utils.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
|
|
| 1 |
from gender_guesser.detector import Detector
|
| 2 |
from huggingface_hub import InferenceClient
|
|
|
|
| 3 |
import os
|
| 4 |
import random
|
| 5 |
import re
|
|
@@ -97,3 +99,29 @@ def get_gender(name: str):
|
|
| 97 |
if gender not in ['male', 'female']:
|
| 98 |
gender = random.choice(['male', 'female'])
|
| 99 |
return gender
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import colorlog
|
| 2 |
from gender_guesser.detector import Detector
|
| 3 |
from huggingface_hub import InferenceClient
|
| 4 |
+
import logging
|
| 5 |
import os
|
| 6 |
import random
|
| 7 |
import re
|
|
|
|
| 99 |
if gender not in ['male', 'female']:
|
| 100 |
gender = random.choice(['male', 'female'])
|
| 101 |
return gender
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def setup_logger(
    name: str,
    level: int = logging.DEBUG
) -> logging.Logger:
    """Return the logger called ``name``, configured for colorized console output.

    The first call for a given name attaches a ``logging.StreamHandler`` with
    a ``colorlog.ColoredFormatter``. Later calls for the same name reuse that
    handler, so importing several modules that each call this function does
    not produce duplicated log lines.

    Args:
        name: Logger name, typically the calling module's ``__name__``.
        level: Threshold applied to the logger and to a newly created handler.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)
    # Look only at this logger's own handlers. Logger.hasHandlers() also
    # reports handlers attached to ancestors (e.g. a root handler installed by
    # another library), which would skip attaching the colored handler here.
    if not logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(level)
        formatter = colorlog.ColoredFormatter(
            "%(log_color)s%(asctime)s.%(msecs)03d - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            log_colors={
                'DEBUG': 'cyan',
                'INFO': 'green',
                'WARNING': 'yellow',
                'ERROR': 'red',
                'CRITICAL': 'bold_red',
            }
        )
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)
    # The logger owns a handler at this point, so stop records from also
    # travelling up to ancestor handlers and being printed a second time.
    logger.propagate = False
    return logger
|