File size: 19,464 Bytes
ad66b01
61b2353
9764706
2e329bd
61b2353
 
 
9513d18
61b2353
 
62eaf4f
61b2353
 
 
a13fabc
 
 
 
 
 
 
61b2353
 
 
 
 
 
 
 
 
 
 
 
 
 
2e329bd
 
61b2353
 
 
 
 
 
 
 
 
 
 
 
 
 
2e329bd
61b2353
9513d18
 
 
 
ca2c7e8
 
 
848b14f
577d055
54261f6
577d055
9513d18
 
61b2353
 
 
9513d18
61b2353
 
 
 
 
 
 
 
 
 
 
 
2ca42fd
 
61b2353
 
 
 
 
9513d18
577d055
 
ca2c7e8
61b2353
 
 
 
 
 
 
 
 
07e2819
61b2353
07e2819
61b2353
5e9984e
 
26ddf5d
61b2353
ca2c7e8
 
88cff0c
a5f46a9
61b2353
577d055
848b14f
0bf43b3
577d055
 
952bac3
52717e9
577d055
a13fabc
 
 
 
 
 
26ddf5d
577d055
0bf43b3
577d055
62eaf4f
 
 
 
 
 
ad66b01
62eaf4f
 
 
 
ad66b01
62eaf4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad66b01
 
62eaf4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26ddf5d
62eaf4f
 
 
 
 
 
 
26ddf5d
62eaf4f
 
 
 
 
ad66b01
 
 
26ddf5d
ad66b01
26ddf5d
 
62eaf4f
26ddf5d
62eaf4f
 
ad66b01
62eaf4f
ad66b01
 
 
62eaf4f
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
from fastapi import FastAPI, Header
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from contextlib import asynccontextmanager
import xml.etree.ElementTree as xmlparser
import requests
from pydantic import BaseModel
import sys
import fitz
import re,os,json
from io import BytesIO
from datetime import datetime

def remove_in_betweens(text):
    """Blank out bracketed and parenthesised spans of *text*.

    Each shortest ``[...]`` span is replaced by a single space first, then
    each shortest ``(...)`` span of the intermediate result. Spans do not
    cross line breaks because ``.`` does not match a newline here.
    """
    for enclosed_span in (r'\[.*?\]', r'\(.*?\)'):
        text = re.sub(enclosed_span, ' ', text)
    return text

def remove_punctuations(text):
    """Return *text* with a fixed set of punctuation characters deleted.

    The deleted characters are: , ; : ? ! ' ’ " ( ) { } [ ] / \\ *
    Every other character (including ``.`` and ``-``) is kept as is.
    """
    dropped_chars = ",;:?!'’\"(){}[]/\\*"
    return text.translate(str.maketrans('', '', dropped_chars))

def receive_signal(signalNumber, frame):
    """Signal handler: print the received signal number, then exit.

    Exiting is done by raising ``SystemExit`` with no exit code, which is
    exactly what ``sys.exit()`` does. *frame* is accepted because the signal
    handler signature requires it; it is not used.
    """
    print('Received:', signalNumber)
    raise SystemExit()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: register the SIGINT handler at startup.

    Everything before ``yield`` runs when the context is entered; nothing is
    done after the ``yield``.
    """
    from signal import SIGINT, signal as install_handler

    install_handler(SIGINT, receive_signal)
    yield

# Application instance; `lifespan` is passed in as the startup/shutdown hook.
app = FastAPI(lifespan=lifespan)

# Serve the local "static" directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Origins allowed to make cross-origin requests; "*" is the wildcard entry.
origins = [
    "*",
]

# CORS is fully open: any origin, method and header, with credentials allowed.
# NOTE(review): FastAPI's CORS documentation advises against combining
# wildcard origins/methods/headers with allow_credentials=True - confirm
# that this configuration is intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def root():
    # Landing page: respond with the file templates/index.html.
    index_page = os.path.join("templates", "index.html")
    return FileResponse(index_page)

# Request body of the /search endpoint.
class Query(BaseModel):
    # Search terms; sent to arXiv as the `all:` field of `search_query`.
    keyword: str
    # Value passed to arXiv as `max_results`.
    limit: int

# Request body of the /extract_pdf/arxiv_id endpoint.
class DocumentID(BaseModel):
    # Identifier appended to http://arxiv.org/pdf/ to build the download URL.
    doc_id: str

# Request body of the /extract_pdf/url endpoint.
class PDF(BaseModel):
    # Address the PDF is downloaded from.
    url: str
    # Number of leading pages to extract; -1 (the default) means every page.
    page_num: int = -1

@app.post("/search")
async def get_articles(query: Query):
    """Search arXiv for publications matching ``query.keyword``.

    Calls the arXiv export API (Atom feed), asking for at most
    ``query.limit`` results across all fields.

    Returns:
        On success ``{"error": False, "message": {arxiv_id: info}}`` where
        each ``info`` holds ``title``, ``authors`` (names joined with
        " and "), ``date`` (``dd/mm/YYYY``), ``abstract`` and ``pdf`` (the
        PDF URL). On any failure ``{"error": True, "message": <reason>}``.
    """
    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    content = {}
    try:
        # Let requests build the query string so that keywords containing
        # spaces, "&", "#" or "+" are encoded instead of corrupting the URL.
        arxiv_search_result = requests.get(
            "http://export.arxiv.org/api/query",
            params={
                "search_query": f"all:{query.keyword}",
                "max_results": query.limit,
            },
            verify=False,
            timeout=30,  # never hang the endpoint on an unresponsive server
        )
        # Surface HTTP failures as a readable message rather than as an
        # obscure XML parsing error further down.
        arxiv_search_result.raise_for_status()
        response = xmlparser.fromstring(arxiv_search_result.text)
        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            # The entry id is a URL; its last path segment is the arXiv id.
            id_pub = pub.find(f"{XML_NAMESPACE}id").text.split("/")[-1]
            title_pub = pub.find(f"{XML_NAMESPACE}title").text
            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            pub_date = datetime.strptime(
                pub.find(f"{XML_NAMESPACE}published").text,
                "%Y-%m-%dT%H:%M:%SZ",
            ).strftime("%d/%m/%Y")
            abstract = pub.find(f"{XML_NAMESPACE}summary").text
            content[id_pub] = {
                "title": title_pub,
                "authors": authors,
                "date": pub_date,
                "abstract": abstract,
                "pdf": f"http://arxiv.org/pdf/{id_pub}"
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Endpoint boundary: report every failure through the error payload.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
    
@app.post("/extract_pdf/arxiv_id")
async def extract_arxiv_pdf(document: DocumentID):
    """Download an arXiv PDF and return its cleaned text.

    The text of all pages is joined, cut at the first case-insensitive
    occurrence of "references", stripped of bracketed/parenthesised spans
    and of punctuation, and whitespace-normalised.

    Returns:
        ``{"error": False, "message": {"pub_id": ..., "text": ...}}`` on
        success, ``{"error": True, "message": <reason>}`` when the download
        or the PDF parsing fails.
    """
    pdf_url = f"http://arxiv.org/pdf/{document.doc_id}"
    try:
        # NOTE(review): verify=False disables TLS certificate checks on any
        # HTTPS redirect; kept from the original, confirm it is required.
        pdf_req = requests.get(pdf_url, verify=False, timeout=60)
    except requests.RequestException as e:
        # Report network failures through the same error payload that the
        # /search endpoint uses instead of letting them become an HTTP 500.
        print("URL: " + pdf_url)
        print(f"Error while downloading PDF: {str(e)}")
        return {"error": True, "message": "Error while downloading PDF: " + str(e)}

    if pdf_req.status_code != 200:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    try:
        # The context manager closes the document once the text is read.
        with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
            pdf_text = " ".join(page.get_text("text") for page in doc)
    except Exception as e:
        # Endpoint boundary: the body may not be a readable PDF at all.
        print(f"Error while reading PDF: {str(e)}")
        return {"error": True, "message": "Error while reading PDF: " + str(e)}

    # Drop everything from the first "references" onwards.
    ref_match = re.search(r"REFERENCES", pdf_text, re.IGNORECASE)
    if ref_match:
        pdf_text = pdf_text[:ref_match.start()]

    postprocess_text = remove_in_betweens(pdf_text)
    postprocess_text = remove_punctuations(postprocess_text)
    postprocess_text = re.sub(r"\s+", " ", postprocess_text).strip()
    return {"error": False, "message": {"pub_id": document.doc_id, "text": postprocess_text}}

@app.post("/extract_pdf/url")
async def extract_pdf(pdf: PDF):
    """Download a PDF from ``pdf.url`` and return its title and cleaned text.

    ``pdf.page_num`` limits extraction to that many leading pages; ``-1``
    extracts every page. A value larger than the document is clamped to the
    page count. The text is stripped of bracketed/parenthesised spans and of
    punctuation, then whitespace-normalised.

    Returns:
        ``{"error": False, "message": {"title": ..., "text": ...}}`` on
        success, ``{"error": True, "message": <reason>}`` when the download
        or the PDF parsing fails.
    """
    # NOTE(review): the URL is caller-supplied and fetched server-side;
    # consider restricting allowed hosts if this service is exposed.
    try:
        pdf_req = requests.get(pdf.url, timeout=60)
    except requests.RequestException as e:
        # Report network failures through the error payload, not an HTTP 500.
        print("URL: " + pdf.url)
        print(f"Error while downloading PDF: {str(e)}")
        return {"error": True, "message": "Error while downloading PDF: " + str(e)}

    if pdf_req.status_code != 200:
        print("URL: " + pdf.url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    try:
        # The context manager closes the document once everything is read.
        with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
            page_total = doc.page_count
            # Clamp the request so asking for more pages than exist does not
            # raise an IndexError.
            last_page = page_total if pdf.page_num == -1 else min(pdf.page_num, page_total)
            pdf_text = " ".join(doc[page].get_text("text") for page in range(last_page))
            pdf_metadata = doc.metadata or {}
    except Exception as e:
        # Endpoint boundary: the body may not be a readable PDF at all.
        print(f"Error while reading PDF: {str(e)}")
        return {"error": True, "message": "Error while reading PDF: " + str(e)}
    print(pdf_metadata)

    postprocess_text = remove_in_betweens(pdf_text)
    postprocess_text = remove_punctuations(postprocess_text)
    postprocess_text = re.sub(r"\s+", " ", postprocess_text).strip()
    # The title entry can be present but None or empty, so fall back on any
    # falsy value rather than only on a missing key.
    title = (pdf_metadata.get("title") or "No title found").strip()
    return {"error": False, "message": {"title": title, "text": postprocess_text}}

# System prompt for the research-plan model: role description, few-shot
# examples of search plans, and the required JSON output format.
_RESEARCH_PLAN_SYSTEM_PROMPT = (
    'You are an experience PhD professor with 20 years experience in research. You help the user build their research plan based on the following examples. build the plan according to the examples without further questions. provide the steps of the plan in a form of research requests to search engines of public document publisher or web searching purposes, nothing else:\n'
    '''<example>
<search-request>
Help me research recent AI-powered marketing campaigns to benchmark for 2025 planning
</search-request>
<search-plan>
Help me research recent AI-powered marketing campaigns to benchmark for 2025 planning by:
(1) Find articles and case studies on AI-powered marketing campaigns in 2024.
(2) Find information on the specific AI technologies used in these campaigns (e.g., generative AI, predictive analytics).
(3) Find data on the results of these campaigns (e.g., ROI, customer engagement).
(4) Find information on the challenges and limitations of using AI in marketing.
(5) Find information on emerging trends in AI-powered marketing for 2025.
(6) Based on the above information, create a report summarizing key takeaways.
(7) Create a SWOT analysis of AI-powered marketing campaigns.
</search-plan>
</example>\n'''
    '''<example>
<search-request>
Research AI models and compare them per use cases for a guide on which model to use for which use case
</search-request>
<search-plan>
Research AI models and compare them per use cases for a guide on which model to use for which use case by:
(1) Find a list of popular AI models and categorize them by type (e.g., image generation, language processing, etc.).
(2) For each AI model, find information on its strengths, weaknesses, and common use cases.
(3) Compare and contrast the AI models within each category based on their performance, ease of use, and cost.
(4) Find real-world examples of how each AI model is being used in different industries and applications.
(5) Create a guide that recommends specific AI models for different use cases, taking into account factors such as accuracy, speed, and cost.
(6) Include a disclaimer in the guide stating that the recommendations are based on current knowledge and may change as AI technology evolves.
</search-plan>
</example>\n'''
    '''<example>
<search-request>
research Open source threat or opportunities to 6G standardization bodies such as 3GPP for thought leadership paper
</search-request>
<search-plan>
Research Open source threat or opportunities to 6G standardization bodies such as 3GPP for thought leadership paper:
(1) Find information on the role of 3GPP in 6G standardization.
(2) Find information on open source initiatives in the 6G space.
(3) Find articles or reports discussing the potential impact of open source on 6G standardization.
(4) Find information on the benefits and challenges of open source for 6G standardization.
(5) Find information on how 3GPP is addressing the challenges of open source.
(6) Find examples of successful open source initiatives in other technology domains.
(7) Based on your research, develop a point of view on the threat or opportunity of open source to 6G standardization bodies like 3GPP.
</search-plan>
</example>\n'''
    '''<example>
<search-request>
research vodafone activities in 6G for a competition analysis include a SWOT analysis
</search-request>
<search-plan>
Research Vodafone's activities in 6G for a competition analysis, including a SWOT analysis, by:
(1) Find Vodafone's public statements and press releases about their 6G research and development efforts.
(2) Find news articles and industry reports about Vodafone's 6G activities.
(3) Find information about Vodafone's partnerships and collaborations in the 6G space.
(4) Find information about Vodafone's investments in 6G infrastructure and technology.
(5) Based on the information gathered, create a SWOT analysis of Vodafone's position in the 6G landscape:
(a) Strengths: Vodafone's existing infrastructure, expertise, and partnerships.
(b) Weaknesses: Vodafone's potential challenges in competing with larger or more established players in the 6G space.
(c) Opportunities: Emerging 6G technologies and market trends that Vodafone could leverage.
(d) Threats: Competition from other companies, regulatory hurdles, and technological uncertainties.
(6) Compare Vodafone's 6G activities to those of its main competitors (e.g., Ericsson, Nokia, Huawei, Samsung) to identify areas of strength and weakness.
(7) Consider any recent developments or announcements in the 6G space that could impact Vodafone's competitive position.
</search-plan>
</example>\n'''
    '''<example>
<search-request>
report on researches on 6G energy efficiency, how to achieve it, assess the reality of these findings or solutions for a technical paper on 6G environmental impact
</search-request>
<search-plan>
Report on researches on 6G energy efficiency, how to achieve it, assess the reality of these findings or solutions for a technical paper on 6G environmental impact by:
(1) Find research papers and articles on 6G energy efficiency.
(2) Summarize the findings of these researches on 6G energy efficiency.
(3) Find proposed solutions to achieve 6G energy efficiency.
(4) Assess the feasibility and potential impact of these solutions.
(5) Find any existing case studies or pilot projects implementing these solutions.
(6) Find information on the potential environmental benefits of 6G energy efficiency.
(7) Find information on the challenges and limitations of achieving 6G energy efficiency.
(8) Find information on the potential economic benefits of 6G energy efficiency.
</search-plan>
</example>\n'''
    '''<example>
<search-request>
research Authentication and Identity Management:
Study lightweight, low-energy authentication methods for IoT and other connected devices.
Explore advancements in identity and access management for 6G networks.
</search-request>
<search-plan>
Research Authentication and Identity Management: Study lightweight, low-energy authentication methods for IoT and other connected devices. Explore advancements in identity and access management for 6G networks by:
(1) Find research papers and articles on lightweight, low-energy authentication methods for IoT and other connected devices.
(2) Find research papers and articles on advancements in identity and access management for 6G networks.
(3) Find information on the current state of authentication and identity management for IoT and other connected devices.
(4) Find information on the challenges and opportunities in authentication and identity management for 6G networks.
(5) Find information on the different types of authentication methods available for IoT and other connected devices.
(6) Find information on the different types of identity and access management systems available for 6G networks.
(7) Compare and contrast the different authentication methods and identity and access management systems.
(8) Identify potential areas for future research in authentication and identity management for IoT and other connected devices, as well as for 6G networks.
</search-plan>
</example>\n'''
    '''<example>
<search-request>
research Global Market Dynamics:
Assess which stakeholders (e.g., operators, tech companies, governments) are likely to drive investment in 6G infrastructure.
Investigate the role of new entrants, such as GAFAM (Google, Amazon, Facebook, Apple, Microsoft), in shaping the 6G ecosystem.
Explore how regions like China, the EU, and the US are positioning themselves for 6G leadership.
</search-request>
<search-plan>
Research Global Market Dynamics: Assess which stakeholders (e.g., operators, tech companies, governments) are likely to drive investment in 6G infrastructure. Investigate the role of new entrants, such as GAFAM (Google, Amazon, Facebook, Apple, Microsoft), in shaping the 6G ecosystem. Explore how regions like China, the EU, and the US are positioning themselves for 6G leadership by:
(1) Find research reports and articles on the 6G market and its potential stakeholders.
(2) Find information on the current investments and initiatives of major telecom operators in 6G. If there are too many, limit to several that are most relevant.
(3) Find information on the R&D efforts of major tech companies, including GAFAM, in 6G technologies.
(4) Find information on government policies and funding initiatives related to 6G in China, the EU, and the US.
(5) Find expert opinions and analysis on the potential drivers of 6G investment and the role of new entrants.
(6) Find information on the potential impact of 6G on various industries and sectors.
(7) Find information on the potential challenges and barriers to 6G adoption and deployment.
(8) Find information on the potential timeline for 6G commercialization and deployment.
</search-plan>
</example>\n'''
    '''<example>
<search-request>
research how new business models beyond ROI could benefit societal impact of 6G
</search-request>
<search-plan>
research how new business models beyond ROI could benefit societal impact of 6G by:
(1) Find articles and research papers discussing the potential societal impact of 6G technology.
(2) Find articles and research papers discussing current business models used in the telecommunications industry and their limitations.
(3) Find articles and research papers discussing alternative business models that could be used to fund and deploy 6G technology.
(4) Find articles and research papers discussing how new business models could be used to maximize the societal benefits of 6G technology.
(5) Find case studies of companies or organizations that have successfully implemented alternative business models to achieve social impact goals.
(6) Find information on potential risks and challenges associated with new business models for 6G technology.
</search-plan>
</example>\n'''
    'Optionally, do not precise the sources, as we search on every websites that we possibly can. Take note that sometimes, the user will send you keywords only, just provide report of them.\n'
    'For the response format, you must send a JSON of this format : [{"step_index": The step number, "step_text": What we have to do, "keywords": The important keywords separated by spaces (no comma) (important: the keywords that we will use for search engines and APIs, so get rid of `research papers`, `articles`, ... keywords), `privilegie les abbreviations`}, ...] Take those for examples :\n'
    '''<example><search-request>Provide a plan for 6G challenges</search-request><search-plan>[
    {
        "step_index": 1,
        "step_text": "Find information on technical challenges in 6G development",
        "keywords": "6G technical challenges development hurdles"
    },
    {
        "step_index": 2,
        "step_text": " Identify key challenges in 6G standardization",
        "keywords": "6G standardization challenges 3GPP"
    },
    {
        "step_index": 3,
        "step_text": "Investigate security challenges in 6G networks",
        "keywords": "6G security threats vulnerabilities"
    },
    {
        "step_index": 4,
        "step_text": "Explore challenges in 6G deployment and implementation",
        "keywords": "6G deployment implementation rollout"
    },
    {
        "step_index": 5,
        "step_text": "Find information on energy efficiency challenges in 6G",
        "keywords": "6G energy efficiency power consumption"
    },
    {
        "step_index": 6,
        "step_text": "Investigate challenges in 6G spectrum management",
        "keywords": "6G spectrum frequency management"
    },
    {
        "step_index": 7,
        "step_text": "Analyze challenges in 6G device and hardware development",
        "keywords": "6G devices hardware UE"
    },
    {
        "step_index": 8,
        "step_text": "Explore challenges in 6G network architecture and design",
        "keywords": "6G network architecture network design RAN"
    },
    {
        "step_index": 9,
        "step_text": "Find information on challenges in 6G testing and validation",
        "keywords": "6G testing validation trial"
    },
    {
        "step_index": 10,
        "step_text": "Investigate challenges in 6G regulation and policy-making",
        "keywords": "6G regulation policy governance"
    }
]</search-plan></example>'''
)


def researcher(model, user, token):
    """Ask the Groq chat-completions API for a research plan.

    Sends the fixed system prompt together with *user* as the user message
    to the model named *model*, authenticating with *token* as a bearer
    token.

    Returns:
        The stripped text of the first choice's message, or ``None`` when
        the request cannot be sent, the API answers with a non-200 status,
        or the response does not have the expected shape.
    """
    url = 'https://api.groq.com/openai/v1/chat/completions'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {token}',
    }
    system_msg = {
        'role': 'system',
        'content': _RESEARCH_PLAN_SYSTEM_PROMPT,
    }
    user_msg = {
        'role': 'user',
        'content': user
    }
    data = {
        'model': model,
        'messages': [system_msg, user_msg]
    }

    try:
        # NOTE(review): verify=False disables TLS certificate checks for a
        # request carrying the API token; kept from the original, confirm
        # that it is really required.
        response = requests.post(
            url,
            headers=headers,
            data=json.dumps(data),
            verify=False,
            timeout=60,  # never hang the caller on an unresponsive API
        )
    except requests.RequestException as e:
        # Connection errors and timeouts follow the same "return None"
        # contract as every other failure of this function.
        print(f"Groq API error on post: {str(e)}")
        return None

    if response.status_code != 200:
        print(f"Groq API error on post: {response.status_code}")
        return None

    try:
        response_data = response.json()
        raw_content = response_data['choices'][0]['message']['content'].strip()
        return raw_content
    except Exception as e:
        print(f"Groq API error after post: {str(e)}")
        return None

# Request body of the /search/plan and /search/plan/arxiv endpoints.
class GroqRequest(BaseModel):
    # Name of the model, forwarded as `model` to the Groq API.
    model: str
    # Text sent as the user message of the chat completion.
    user: str

def _parse_plan_json(raw_plan):
    """Decode the model output as JSON; return ``None`` if that fails.

    Whitespace runs are collapsed first. If the whole text is not valid
    JSON (for instance because the model wrapped it in a code fence or
    added prose), the span from the first "[" to the last "]" is tried.
    """
    normalized = re.sub(r"\s+", " ", raw_plan)
    try:
        return json.loads(normalized)
    except json.JSONDecodeError:
        pass
    start, end = normalized.find("["), normalized.rfind("]")
    if start == -1 or end <= start:
        return None
    try:
        return json.loads(normalized[start:end + 1])
    except json.JSONDecodeError:
        return None


@app.post("/search/plan")
async def get_research_plan(infos: GroqRequest, api_key: str = Header(None, alias="GROQ_TOKEN")):
    """Generate a research plan with the Groq API.

    The API key is read from the ``GROQ_TOKEN`` request header.

    Returns:
        ``{"error": False, "message": {"plan": <decoded JSON>}}`` on
        success; ``{"error": True, "message": <reason>}`` when the key is
        missing, the generation fails, or the model output is not JSON.
    """
    if api_key is None:
        return {"error": True, "message": "Missing API key"}
    plan = researcher(infos.model, infos.user, api_key)
    if plan is None:
        return {"error": True, "message": "Error while generating the research plan"}
    parsed_plan = _parse_plan_json(plan)
    if parsed_plan is None:
        # Model output is free text; report bad JSON instead of crashing.
        print(f"Unparseable research plan: {plan}")
        return {"error": True, "message": "Error while parsing the research plan"}
    return {"error": False, "message": {"plan": parsed_plan}}

@app.post("/search/plan/arxiv")
async def get_arxiv_research_plan(infos: GroqRequest, api_key: str = Header(None, alias="GROQ_TOKEN")):
    """Generate a research plan and look up arXiv articles for each step.

    For every plan step, arXiv is searched with the step's keywords
    (5 results at most). Steps that are malformed or whose search fails
    are skipped.

    Returns:
        ``{"error": False, "message": [{"step_id", "request", "articles"}]}``
        where ``articles`` is the list of arXiv ids found for the step, or
        ``{"error": True, "message": <reason>}`` if no plan could be built.
    """
    # get_research_plan is a coroutine function: it must be awaited, and its
    # plan is nested under "message" in the returned payload.
    plan_result = await get_research_plan(infos, api_key)
    if plan_result["error"]:
        return plan_result
    plan = plan_result["message"]["plan"]
    if not plan:
        return {"error": True, "message": "Error while generating the research plan"}

    plan_articles = []
    for step in plan:
        try:
            # Read the fields by name instead of relying on key order.
            index = step["step_index"]
            inst = step["step_text"]
            kws = step["keywords"]
        except (KeyError, TypeError):
            print(f"Skipping malformed plan step: {step}")
            continue
        data = await get_articles(Query(keyword=kws, limit=5))
        if data["error"]:
            print(data["message"])
            continue
        publications = data["message"]
        plan_articles.append({'step_id': index, 'request': inst, 'articles': list(publications)})
    return {"error": False, "message": plan_articles}