File size: 8,451 Bytes
73891ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90f8e12
73891ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c04199
73891ee
 
 
 
 
 
 
 
 
 
 
 
 
 
2d9df18
73891ee
 
 
 
1c04199
73891ee
 
 
 
 
 
 
154e98e
73891ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d3e224
 
 
84e61b9
 
 
73891ee
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
from functools import lru_cache
import time, aiohttp, asyncio, json, os, multiprocessing, torch, \
    requests, xmltodict, fitz, io
from minivectordb.embedding_model import EmbeddingModel
from minivectordb.vector_database import VectorDatabase
from text_util_en_pt.cleaner import structurize_text, detect_language, Language
import gradio as gr

# Limit torch to two CPU threads for this process.
torch.set_num_threads(2)

# OpenRouter API key read from the environment; None when OPENROUTER_KEY is unset.
openrouter_key = os.getenv("OPENROUTER_KEY")

# Single embedding model instance shared by every request in this module.
model = EmbeddingModel(use_quantized_onnx_model=True)

def convert_xml_to_json(xml):
    """Parse an XML document with ``xmltodict`` and return the parsed structure."""
    parsed = xmltodict.parse(xml)
    return parsed

def clean_title(title):
    """Return *title* on a single line with every run of spaces collapsed.

    Newlines are turned into spaces first; afterwards any sequence of
    consecutive spaces is reduced to exactly one space. Leading and trailing
    spaces are collapsed too, but not removed.
    """
    kept = []
    previous_was_space = False
    for char in title:
        if char == '\n':
            char = ' '
        is_space = char == ' '
        # Drop a space only when it directly follows another space.
        if not (is_space and previous_was_space):
            kept.append(char)
        previous_was_space = is_space
    return ''.join(kept)

@lru_cache(maxsize=500)
def fetch_arxiv_links(query, max_results=5):
    """Search the arXiv export API and return title/link metadata per hit.

    Args:
        query: Free-text search term; sent to the API as ``all:<query>``.
        max_results: Maximum number of entries to request.

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'`` and
        ``'pdf_link'``. The list is empty when the feed has no entries.
        Results are memoized by ``lru_cache``, so repeated calls with the
        same arguments return the same list object; treat it as read-only.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Let requests build the query string so characters such as '&', '#'
    # or '+' inside the search term cannot corrupt the URL.
    response = requests.get(
        'http://export.arxiv.org/api/query',
        params={
            'search_query': f'all:{query}',
            'start': 0,
            'max_results': max_results,
        },
        timeout=30,
    )
    response.raise_for_status()

    feed = convert_xml_to_json(response.text).get('feed') or {}
    raw_entries = feed.get('entry') or []
    # xmltodict yields a plain mapping (not a list) when the feed contains
    # exactly one <entry>; normalise so the loop below always sees a list.
    if isinstance(raw_entries, dict):
        raw_entries = [raw_entries]

    entries = []
    for entry in raw_entries:
        paper_id = entry['id'].split('/abs/')[-1]
        entries.append({
            'title': clean_title(entry['title']),
            'link': f'http://arxiv.org/abs/{paper_id}',
            'pdf_link': f'http://arxiv.org/pdf/{paper_id}.pdf',
        })
    return entries

def download_pdf_from_link(link):
    """Download *link* and return the response body as an in-memory stream.

    Args:
        link: URL of the file to download.

    Returns:
        An ``io.BytesIO`` positioned at the start of the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never handed on as if it were the file.
        requests.RequestException: On connection problems or timeout.
    """
    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(link, timeout=60)
    response.raise_for_status()
    return io.BytesIO(response.content)

@lru_cache(maxsize=100)
def read_remote_pdf(pdf_metadata):
    """Download a PDF and extract the text of all its pages.

    Args:
        pdf_metadata: JSON string encoding a mapping with at least the keys
            ``'pdf_link'`` and ``'title'``. It is passed as a string so the
            argument is hashable for ``lru_cache``.

    Returns:
        A dict ``{'title': <title>, 'text': <page texts joined by newlines>}``.

    NOTE(review): fetch_data_from_pdfs invokes this function inside a
    ``multiprocessing.Pool`` that is created per call, so this cache
    presumably lives in the short-lived worker processes and is rarely hit.
    """
    metadata = json.loads(pdf_metadata)
    link = metadata['pdf_link']
    title = metadata['title']

    pdf_bytes = download_pdf_from_link(link).read()
    pdf_file = fitz.open("pdf", pdf_bytes)
    try:
        page_texts = [page.get_text() for page in pdf_file]
    finally:
        # Close the document even when text extraction fails midway.
        pdf_file.close()
    return {'title': title, 'text': '\n'.join(page_texts)}

def fetch_data_from_pdfs(links):
    """Download and extract text from several PDFs in parallel processes.

    Args:
        links: Iterable of JSON-serialisable metadata mappings, each with
            the ``'pdf_link'`` and ``'title'`` keys read by read_remote_pdf.

    Returns:
        A list with one read_remote_pdf result per input, in input order.
        An empty input yields an empty list without starting any process.
    """
    # Serialise each mapping because read_remote_pdf takes a JSON string.
    serialized = [json.dumps(link) for link in links]
    if not serialized:
        return []

    # Never start more workers than there are documents to fetch.
    worker_count = min(10, len(serialized))
    with multiprocessing.Pool(worker_count) as pool:
        return pool.map(read_remote_pdf, serialized)

def index_and_search(query, pdf_metadata):
    """Embed every sentence of the given PDFs and retrieve those closest to *query*.

    Args:
        query: Text whose embedding is used as the search vector.
        pdf_metadata: Iterable of dicts with the keys ``'text'`` and ``'title'``.

    Returns:
        A tuple ``(retrieved_contents, embedding_time, retrieval_time)``.
        ``retrieved_contents`` maps a title to its retrieved sentences joined
        by newlines; only titles with more than two retrieved sentences are
        kept. Both times are in seconds.
    """
    embedding_started = time.time()
    query_embedding = model.extract_embeddings(query)

    # Build a fresh in-memory index for this request.
    vector_db = VectorDatabase()

    # Flatten all documents into (title, sentence) pairs, document by document.
    titled_sentences = (
        (pdf_data['title'], item['sentence'])
        for pdf_data in pdf_metadata
        for item in structurize_text(pdf_data['text'])
    )
    # Ids are consecutive integers starting at 1 across all documents.
    for unique_id, (title, sentence) in enumerate(titled_sentences, start=1):
        vector_db.store_embedding(
            unique_id,
            model.extract_embeddings(sentence),
            {'sentence': sentence, 'title': title},
        )

    embedding_time = time.time() - embedding_started

    # Retrieval: the metadata of the hits is the third element of the result.
    retrieval_started = time.time()
    search_results = vector_db.find_most_similar(query_embedding, k = 15)
    search_metadata = search_results[2]
    retrieval_time = time.time() - retrieval_started

    # Group the retrieved sentences by the title they came from.
    sentences_by_title = {}
    for hit in search_metadata:
        sentences_by_title.setdefault(hit['title'], []).append(hit['sentence'])

    retrieved_contents = {}
    for title, sentences in sentences_by_title.items():
        if len(sentences) > 2:
            retrieved_contents[title] = '\n'.join(sentences)

    return retrieved_contents, embedding_time, retrieval_time

def retrieval_pipeline(query, question):
    """Run search, PDF extraction and retrieval, timing each stage.

    Args:
        query: Search term passed to fetch_arxiv_links.
        question: Question passed to index_and_search as the retrieval query.

    Returns:
        A tuple ``(retrieved_contents, websearch_time, webcrawl_time,
        embedding_time, retrieval_time, links)``; all times are in seconds.
    """
    search_started = time.time()
    links = fetch_arxiv_links(query)
    websearch_time = time.time() - search_started

    crawl_started = time.time()
    pdf_metadata = fetch_data_from_pdfs(links)
    webcrawl_time = time.time() - crawl_started

    contents, embedding_time, retrieval_time = index_and_search(question, pdf_metadata)

    return (
        contents,
        websearch_time,
        webcrawl_time,
        embedding_time,
        retrieval_time,
        links,
    )


def _strip_label(part):
    """Return the text after the first ':' in *part*, or all of it if there is none."""
    label, colon, value = part.partition(':')
    return (value if colon else label).strip()


def _parse_search_message(message):
    """Split 'Search: <query>; Question: <question>' into (query, question), or None if malformed."""
    # Split on the first ';' only, so the question itself may contain ';' or ':'.
    search_part, separator, question_part = message.partition(';')
    if not separator:
        return None
    query = _strip_label(search_part)
    question = _strip_label(question_part)
    if not query or not question:
        return None
    return query, question


def _build_prompt(question, retrieved_contents, in_portuguese):
    """Assemble the LLM prompt from the retrieved article excerpts and the question."""
    if in_portuguese:
        context = ''.join(
            f'Artigo "{title}"\nConteúdo:\n{content}\n\n'
            for title, content in retrieved_contents.items()
        )
        return f'{context.strip()}\n\nBaseado nos conteúdos dos artigos, responda: "{question}"\n\nPor favor, mencione a fonte da sua resposta.\nResponda somente em português brasileiro'
    context = ''.join(
        f'Article "{title}"\nContent:\n{content}\n\n'
        for title, content in retrieved_contents.items()
    )
    return f'{context.strip()}\n\nBased on the article\'s contents, answer: "{question}"\n\nPlease, mention the source of your answer.'


def _extract_stream_text(line):
    """Return the text carried by one 'data: ...' stream line, or None if it carries none."""
    prefix = "data: "
    if not line.startswith(prefix):
        return None
    event_data = line[len(prefix):]
    if event_data == '[DONE]':
        return None
    try:
        payload = json.loads(event_data)
    except json.JSONDecodeError:
        return None
    try:
        choice = payload['choices'][0]
    except (KeyError, IndexError, TypeError):
        return None
    if not isinstance(choice, dict):
        return None
    # Prefer the chat-style delta and fall back to the plain 'text' field.
    delta = choice.get('delta')
    text = delta.get('content') if isinstance(delta, dict) else None
    if not isinstance(text, str):
        text = choice.get('text')
    return text if isinstance(text, str) else None


async def predict(message, history):
    """Answer a question about arXiv papers, streaming the growing reply.

    Args:
        message: User input in the form
            ``"Search: <query>; Question: <question>"``.
        history: Chat history supplied by the chat interface; not used.

    Yields:
        The accumulated answer text after each received piece, and finally
        the full answer followed by the visited links and stage timings.
        A malformed *message* yields a single usage hint instead.
    """
    parsed = _parse_search_message(message)
    if parsed is None:
        yield 'Please use the format "Search: <query>; Question: <question>".'
        return
    query, question = parsed

    # NOTE(review): retrieval_pipeline is a synchronous call, so the event
    # loop is occupied while it runs.
    retrieved_contents, websearch_time, webcrawl_time, embedding_time, retrieval_time, links = retrieval_pipeline(query, question)

    # Detect the language from the question text itself (a string), not from
    # the already-split message parts.
    in_portuguese = detect_language(question) == Language.ptbr
    prompt = _build_prompt(question, retrieved_contents, in_portuguese)

    print(prompt)

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = { "Content-Type": "application/json",
                "Authorization": f"Bearer {openrouter_key}" }
    body = { "stream": True,
             "model": "deepseek/deepseek-chat",
             "max_tokens": 1024,
             "messages": [
                 {"role": "user", "content": prompt}
             ] }

    full_response = ""
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=body) as response:
            if response.status >= 400:
                # Surface the failure instead of showing an empty answer.
                full_response = f"The language model request failed (HTTP {response.status})."
                yield full_response
            else:
                # Buffer raw bytes and decode only complete lines, so a
                # multi-byte UTF-8 character split across two network chunks
                # cannot break decoding.
                pending = b""
                async for chunk in response.content.iter_any():
                    pending += chunk
                    while b"\n" in pending:
                        raw_line, pending = pending.split(b"\n", 1)
                        line = raw_line.decode("utf-8", errors="replace").rstrip("\r")
                        piece = _extract_stream_text(line)
                        if piece:
                            full_response += piece
                            yield full_response
                            await asyncio.sleep(0.01)

    visited = ''.join(f"{link['title']} ({link['link']})\n" for link in links)
    final_metadata_block = (
        f"Links visited:\n{visited}"
        f"\nWeb search time: {websearch_time:.4f} seconds\n"
        f"\nText extraction: {webcrawl_time:.4f} seconds\n"
        f"\nEmbedding time: {embedding_time:.4f} seconds\n"
        f"\nRetrieval from VectorDB time: {retrieval_time:.4f} seconds"
    )

    yield f"{full_response}\n\n{final_metadata_block}"

# Example prompts offered in the UI; each follows the
# "Search: <query>; Question: <question>" format that predict parses.
example_prompts = [
    'Search: RAG LLM; Question: What are some challenges of implementing a system of RAG with LLMs?',
    'Search: LLM Self-Play; Question: What are the benefits of using self-play with LLMs?',
    'Search: Portable Blockchain; Question: How can a portable blockchain device be implemented?',
    'Search: 1.58 bit LLMs; Question: How do 1.58 bit LLMs work? Is there an available model to test?',
    'Search: Programação Robocode; Question: Como posso utilizar o robocode no contexto de aprendizagem de programação?',
    'Search: Pensamento Computacional; Question: Explique os conceitos do pensamento computacional.'
]

# Build the chat UI around predict (retry and undo buttons disabled) and start it.
chat_interface = gr.ChatInterface(
    predict,
    title="Automated Arxiv Paper Search and Question Answering",
    description="Provide a search term and a question to find relevant papers and answer questions about them.",
    retry_btn=None,
    undo_btn=None,
    examples=example_prompts
)
chat_interface.launch()