Spaces:
Runtime error
Runtime error
File size: 8,451 Bytes
73891ee 90f8e12 73891ee 1c04199 73891ee 2d9df18 73891ee 1c04199 73891ee 154e98e 73891ee 8d3e224 84e61b9 73891ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
from functools import lru_cache
import time, aiohttp, asyncio, json, os, multiprocessing, torch, \
requests, xmltodict, fitz, io
from minivectordb.embedding_model import EmbeddingModel
from minivectordb.vector_database import VectorDatabase
from text_util_en_pt.cleaner import structurize_text, detect_language, Language
import gradio as gr
# Module-level setup: runs once at import time.
torch.set_num_threads(2)  # cap CPU threads — presumably to keep a shared/small host responsive
openrouter_key = os.environ.get("OPENROUTER_KEY")  # OpenRouter API key; None if unset (requests would then 401)
model = EmbeddingModel(use_quantized_onnx_model=True)  # sentence-embedding model (quantized ONNX variant)
def convert_xml_to_json(xml):
    """Parse an XML string into a nested dict structure via xmltodict."""
    parsed = xmltodict.parse(xml)
    return parsed
def clean_title(title):
    """Normalize an arXiv title to a single line.

    The Atom feed wraps long titles across indented lines, producing
    embedded newlines and runs of spaces. Collapse every run of
    whitespace (spaces, tabs, newlines) into a single space and strip
    the ends — `str.split()` with no argument does exactly this, in one
    pass, replacing the original's replace-in-a-loop approach.

    :param title: raw title text from the feed
    :return: whitespace-normalized title
    """
    return ' '.join(title.split())
@lru_cache(maxsize=500)
def fetch_arxiv_links(query, max_results=5):
    """Search the arXiv API and return paper metadata.

    :param query: free-text search term (matched against all fields)
    :param max_results: maximum number of papers to return
    :return: list of dicts with 'title', 'link' (abstract page) and
        'pdf_link' keys
    :raises requests.HTTPError: on a non-2xx API response

    Results are memoized (lru_cache) so repeated identical searches
    skip the network round trip.
    """
    # Let requests build and percent-encode the query string — the
    # original interpolated the raw query straight into the URL.
    response = requests.get(
        'http://export.arxiv.org/api/query',
        params={'search_query': f'all:{query}', 'start': 0, 'max_results': max_results},
        timeout=30,
    )
    response.raise_for_status()
    json_response = convert_xml_to_json(response.text)
    # xmltodict collapses a single <entry> into a plain dict instead of
    # a one-element list; iterating that dict would yield key strings
    # and crash below. Normalize to a list, and tolerate zero entries.
    raw_entries = json_response['feed'].get('entry', [])
    if isinstance(raw_entries, dict):
        raw_entries = [raw_entries]
    entries = []
    for entry in raw_entries:
        # 'id' is the abstract URL, e.g. http://arxiv.org/abs/2403.01234v1
        arxiv_id = entry['id'].split('/abs/')[-1]
        entries.append({
            'title': clean_title(entry['title']),
            'link': f'http://arxiv.org/abs/{arxiv_id}',
            'pdf_link': f'http://arxiv.org/pdf/{arxiv_id}.pdf',
        })
    return entries
def download_pdf_from_link(link):
    """Download a PDF into memory.

    :param link: URL of the PDF
    :return: io.BytesIO holding the full response body
    :raises requests.HTTPError: on a non-2xx response — previously an
        HTML error page would have been silently returned as a "PDF"
    """
    # Timeout added: requests.get without one can block forever.
    response = requests.get(link, timeout=60)
    response.raise_for_status()
    return io.BytesIO(response.content)
@lru_cache(maxsize=100)
def read_remote_pdf(pdf_metadata):
    """Download a PDF and extract its plain text.

    :param pdf_metadata: JSON-encoded dict with 'pdf_link' and 'title'
        keys (a string rather than a dict so the lru_cache key is
        hashable)
    :return: dict with 'title' and 'text' (pages joined by newlines)
    """
    meta = json.loads(pdf_metadata)
    pdf_content = download_pdf_from_link(meta['pdf_link'])
    pdf_file = fitz.open("pdf", pdf_content.read())
    try:
        # Leak fix: the original never closed the document if
        # get_text() raised mid-extraction.
        text_content = [page.get_text() for page in pdf_file]
    finally:
        pdf_file.close()
    return {'title': meta['title'], 'text': '\n'.join(text_content)}
def fetch_data_from_pdfs(links):
    """Download and extract text from several PDFs concurrently.

    :param links: list of dicts with at least 'pdf_link' and 'title'
    :return: list of {'title', 'text'} dicts, in input order

    Uses a *thread* pool instead of the original process pool: the work
    is I/O-bound (HTTP downloads), threads avoid fork/pickle overhead,
    and — unlike short-lived worker processes — they share the
    lru_cache on read_remote_pdf, so repeated queries actually hit the
    cache.
    """
    from multiprocessing.pool import ThreadPool  # same .map API as multiprocessing.Pool
    # JSON-encode each link dict so the cached worker gets a hashable key.
    serialized = [json.dumps(link) for link in links]
    with ThreadPool(10) as pool:
        return pool.map(read_remote_pdf, serialized)
def index_and_search(query, pdf_metadata):
    """Embed every sentence of the crawled papers, index them in an
    in-memory vector DB, and retrieve the passages most similar to
    `query`.

    :param query: the user's question
    :param pdf_metadata: list of {'title', 'text'} dicts
    :return: (contents_by_title, embedding_time, retrieval_time) where
        contents_by_title maps a paper title to its retrieved sentences
        joined by newlines — only titles contributing more than two
        sentences are kept
    """
    t0 = time.time()
    query_embedding = model.extract_embeddings(query)

    # Indexing: one embedding per sentence, ids are a running counter.
    vector_db = VectorDatabase()
    next_id = 1
    for doc in pdf_metadata:
        for fragment in structurize_text(doc['text']):
            sentence = fragment['sentence']
            vector_db.store_embedding(
                next_id,
                model.extract_embeddings(sentence),
                {'sentence': sentence, 'title': doc['title']},
            )
            next_id += 1
    embedding_time = time.time() - t0

    # Retrieval: top-15 nearest sentences; index 2 of the result holds
    # the stored metadata dicts.
    t0 = time.time()
    search_metadata = vector_db.find_most_similar(query_embedding, k=15)[2]
    retrieval_time = time.time() - t0

    # Group retrieved sentences by source paper.
    grouped = {}
    for meta in search_metadata:
        grouped.setdefault(meta['title'], []).append(meta['sentence'])
    contents = {
        title: '\n'.join(sentences)
        for title, sentences in grouped.items()
        if len(sentences) > 2
    }
    return contents, embedding_time, retrieval_time
def retrieval_pipeline(query, question):
    """Run the full pipeline: arXiv search → PDF crawl → embed/index →
    retrieve.

    :param query: search term sent to arXiv
    :param question: question used for similarity retrieval
    :return: (retrieved_contents, websearch_time, webcrawl_time,
        embedding_time, retrieval_time, links)
    """
    t0 = time.time()
    links = fetch_arxiv_links(query)
    websearch_time = time.time() - t0

    t0 = time.time()
    pdf_metadata = fetch_data_from_pdfs(links)
    webcrawl_time = time.time() - t0

    contents, embedding_time, retrieval_time = index_and_search(question, pdf_metadata)
    return contents, websearch_time, webcrawl_time, embedding_time, retrieval_time, links
def _build_prompt(question, retrieved_contents):
    """Assemble the LLM prompt from the retrieved article excerpts,
    localized to the language of `question` (pt-BR or English)."""
    if detect_language(question) == Language.ptbr:
        context = ""
        for title, content in retrieved_contents.items():
            context += f'Artigo "{title}"\nConteúdo:\n{content}\n\n'
        return f'{context.strip()}\n\nBaseado nos conteúdos dos artigos, responda: "{question}"\n\nPor favor, mencione a fonte da sua resposta.\nResponda somente em português brasileiro'
    context = ""
    for title, content in retrieved_contents.items():
        context += f'Article "{title}"\nContent:\n{content}\n\n'
    return f'{context.strip()}\n\nBased on the article\'s contents, answer: "{question}"\n\nPlease, mention the source of your answer.'

async def predict(message, history):
    """Gradio streaming chat handler.

    Expects `message` as "Search: <query>; Question: <question>".
    Runs the retrieval pipeline, streams the LLM answer token by token
    from OpenRouter, then yields a final timing/metadata footer.
    """
    # Parse the two parts; split on the first ';' only so questions
    # containing ';' survive.
    search_part, question_part = message.split(';', 1)
    query = search_part.split(':')[-1].strip()
    question = question_part.split(':')[-1].strip()

    retrieved_contents, websearch_time, webcrawl_time, embedding_time, retrieval_time, links = retrieval_pipeline(query, question)

    # Bug fix: the original called detect_language on the *split list*
    # (message after message.split(';')), not on text. Detection now
    # runs on the question string inside _build_prompt.
    prompt = _build_prompt(question, retrieved_contents)
    print(prompt)

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {"Content-Type": "application/json",
               "Authorization": f"Bearer {openrouter_key}"}
    body = {"stream": True,
            "model": "deepseek/deepseek-chat",
            "max_tokens": 1024,
            "messages": [
                {"role": "user", "content": prompt}
            ]}

    full_response = ""
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=body) as response:
            buffer = ""  # holds incomplete SSE lines between chunks
            async for chunk in response.content.iter_any():
                buffer += chunk.decode()
                # Process every complete line currently in the buffer.
                while "\n" in buffer:
                    line, buffer = buffer.split("\n", 1)
                    if not line.startswith("data: "):
                        continue
                    event_data = line[len("data: "):]
                    if event_data == '[DONE]':
                        continue
                    # Chat-completion deltas carry text under
                    # ['delta']['content']; some models stream it under
                    # ['text'] instead. Skip keep-alives / malformed events.
                    try:
                        current_text = json.loads(event_data)['choices'][0]['delta']['content']
                    except Exception:
                        try:
                            current_text = json.loads(event_data)['choices'][0]['text']
                        except Exception:
                            continue
                    full_response += current_text
                    yield full_response
                    await asyncio.sleep(0.01)

    # Footer: visited links plus per-stage timings.
    final_metadata_block = ""
    final_metadata_block += f"Links visited:\n"
    for link in links:
        final_metadata_block += f"{link['title']} ({link['link']})\n"
    final_metadata_block += f"\nWeb search time: {websearch_time:.4f} seconds\n"
    final_metadata_block += f"\nText extraction: {webcrawl_time:.4f} seconds\n"
    final_metadata_block += f"\nEmbedding time: {embedding_time:.4f} seconds\n"
    final_metadata_block += f"\nRetrieval from VectorDB time: {retrieval_time:.4f} seconds"

    yield f"{full_response}\n\n{final_metadata_block}"
# Launch the Gradio chat UI; `predict` streams the answer and then a
# timing/metadata footer. Examples follow the expected
# "Search: <query>; Question: <question>" message format.
gr.ChatInterface(
    predict,
    title="Automated Arxiv Paper Search and Question Answering",
    description="Provide a search term and a question to find relevant papers and answer questions about them.",
    retry_btn=None,
    undo_btn=None,
    examples=[
        'Search: RAG LLM; Question: What are some challenges of implementing a system of RAG with LLMs?',
        'Search: LLM Self-Play; Question: What are the benefits of using self-play with LLMs?',
        'Search: Portable Blockchain; Question: How can a portable blockchain device be implemented?',
        'Search: 1.58 bit LLMs; Question: How do 1.58 bit LLMs work? Is there an available model to test?',
        'Search: Programação Robocode; Question: Como posso utilizar o robocode no contexto de aprendizagem de programação?',
        'Search: Pensamento Computacional; Question: Explique os conceitos do pensamento computacional.'
    ]
).launch()