TheoLvs committed on
Commit caf1faa
1 Parent(s): 001af11

Experimental openalex feature

Ekimetrics_Logo_Color.jpg DELETED
Binary file (76.8 kB)
app.py CHANGED
@@ -1,6 +1,11 @@
 from climateqa.engine.embeddings import get_embeddings_function
 embeddings_function = get_embeddings_function()

+from climateqa.papers.openalex import OpenAlex
+from sentence_transformers import CrossEncoder
+
+reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
+oa = OpenAlex()

 import gradio as gr
 import pandas as pd
@@ -32,6 +37,8 @@ from climateqa.engine.prompts import audience_prompts
 from climateqa.sample_questions import QUESTIONS
 from climateqa.constants import POSSIBLE_REPORTS
 from climateqa.utils import get_image_from_azure_blob_storage
+from climateqa.engine.keywords import make_keywords_chain
+from climateqa.engine.rag import make_rag_papers_chain

 # Load environment variables in local mode
 try:
@@ -141,19 +148,20 @@ async def chat(query,history,audience,sources,reports):
     # result = rag_chain.stream(inputs)

     path_reformulation = "/logs/reformulation/final_output"
+    path_keywords = "/logs/keywords/final_output"
     path_retriever = "/logs/find_documents/final_output"
     path_answer = "/logs/answer/streamed_output_str/-"

     docs_html = ""
     output_query = ""
     output_language = ""
+    output_keywords = ""
     gallery = []

     try:
         async for op in result:

             op = op.ops[0]
-            # print("ITERATION",op)

             if op['path'] == path_reformulation: # reformulated question
                 try:
@@ -162,6 +170,14 @@ async def chat(query,history,audience,sources,reports):
                 except Exception as e:
                     raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")

+            if op["path"] == path_keywords:
+                try:
+                    output_keywords = op['value']["keywords"] # list of str
+                    output_keywords = " AND ".join(output_keywords)
+                except Exception as e:
+                    pass
+
+
             elif op['path'] == path_retriever: # documents
                 try:
                     docs = op['value']['docs'] # List[Document]
@@ -183,23 +199,13 @@ async def chat(query,history,audience,sources,reports):
                     answer_yet = parse_output_llm_with_sources(answer_yet)
                     history[-1] = (query,answer_yet)

-
-            # elif op['path'] == final_output_path_id:
-            #     final_output = op['value']
-
-            #     if "answer" in final_output:
-
-            #         final_output = final_output["answer"]
-            #         print(final_output)
-            #         answer = history[-1][1] + final_output
-            #         answer = parse_output_llm_with_sources(answer)
-            #         history[-1] = (query,answer)

+
             else:
                 continue

             history = [tuple(x) for x in history]
-            yield history,docs_html,output_query,output_language,gallery
+            yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords

     except Exception as e:
         raise gr.Error(f"{e}")
@@ -267,37 +273,7 @@ async def chat(query,history,audience,sources,reports):
     # gallery = list(set("|".join(gallery).split("|")))
     # gallery = [get_image_from_azure_blob_storage(x) for x in gallery]

-    yield history,docs_html,output_query,output_language,gallery
-
-
-    # memory.save_context(inputs, {"answer": gradio_format[-1][1]})
-    # yield gradio_format, memory.load_memory_variables({})["history"], source_string
-
-# async def chat_with_timeout(query, history, audience, sources, reports, timeout_seconds=2):
-#     async def timeout_gen(async_gen, timeout):
-#         try:
-#             while True:
-#                 try:
-#                     yield await asyncio.wait_for(async_gen.__anext__(), timeout)
-#                 except StopAsyncIteration:
-#                     break
-#         except asyncio.TimeoutError:
-#             raise gr.Error("Operation timed out. Please try again.")
-
-#     return timeout_gen(chat(query, history, audience, sources, reports), timeout_seconds)
-
-
-
-# # A wrapper function that includes a timeout
-# async def chat_with_timeout(query, history, audience, sources, reports, timeout_seconds=2):
-#     try:
-#         # Use asyncio.wait_for to apply a timeout to the chat function
-#         return await asyncio.wait_for(chat(query, history, audience, sources, reports), timeout_seconds)
-#     except asyncio.TimeoutError:
-#         # Handle the timeout error as desired
-#         raise gr.Error("Operation timed out. Please try again.")
-
-
+    yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords


 def make_html_source(source,i):
@@ -392,6 +368,79 @@ def log_on_azure(file, logs, share_client):
     file_client.upload_file(logs)


+def generate_keywords(query):
+    chain = make_keywords_chain(llm)
+    keywords = chain.invoke(query)
+    keywords = " AND ".join(keywords["keywords"])
+    return keywords
+
+
+
+papers_cols_widths = {
+    "doc":50,
+    "id":100,
+    "title":300,
+    "doi":100,
+    "publication_year":100,
+    "abstract":500,
+    "rerank_score":100,
+    "is_oa":50,
+}
+
+papers_cols = list(papers_cols_widths.keys())
+papers_cols_widths = list(papers_cols_widths.values())
+
+async def find_papers(query, keywords, after):
+
+    summary = ""
+
+    df_works = oa.search(keywords,after = after)
+    df_works = df_works.dropna(subset=["abstract"])
+    df_works = oa.rerank(query,df_works,reranker)
+    df_works = df_works.sort_values("rerank_score",ascending=False)
+    G = oa.make_network(df_works)
+
+    height = "750px"
+    network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
+    network_html = network.generate_html()
+
+    network_html = network_html.replace("'", "\"")
+    css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
+    network_html = network_html + css_to_inject
+
+    network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
+    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
+    allow-scripts allow-same-origin allow-popups
+    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
+    allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
+
+    docs = df_works["content"].head(15).tolist()
+
+    df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
+    df_works["doc"] = df_works["doc"] + 1
+    df_works = df_works[papers_cols]
+
+    yield df_works,network_html,summary
+
+    chain = make_rag_papers_chain(llm)
+    result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
+    path_answer = "/logs/StrOutputParser/streamed_output/-"
+
+    async for op in result:
+
+        op = op.ops[0]
+
+        if op['path'] == path_answer: # streamed answer tokens
+            new_token = op['value'] # str
+            summary += new_token
+        else:
+            continue
+        yield df_works,network_html,summary
+
+
 # --------------------------------------------------------------------
 # Gradio
 # --------------------------------------------------------------------
@@ -474,7 +523,7 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
                     samples.append(group_examples)


-                with gr.Tab("Citations",elem_id = "tab-citations",id = 1):
+                with gr.Tab("Sources",elem_id = "tab-citations",id = 1):
                     sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
                     docs_textbox = gr.State("")

@@ -513,6 +562,7 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main



+
     #---------------------------------------------------------------------------------------
     # OTHER TABS
     #---------------------------------------------------------------------------------------
@@ -521,6 +571,28 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
     with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
         gallery_component = gr.Gallery()

+    with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
+                keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
+                after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
+                search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
+
+            with gr.Column(scale=7):
+
+                with gr.Tab("Summary",elem_id="papers-summary-tab"):
+                    papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
+
+                with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
+                    papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
+
+                with gr.Tab("Citations network",elem_id="papers-network-tab"):
+                    citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
+
+
+
     with gr.Tab("About",elem_classes = "max-height other-tabs"):
         with gr.Row():
             with gr.Column(scale=1):
@@ -537,13 +609,13 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main

     (textbox
         .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_textbox")
+        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
     )

     (examples_hidden
         .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_examples")
+        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_examples")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
     )

@@ -558,6 +630,9 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main

     dropdown_samples.change(change_sample_questions,dropdown_samples,samples)

+    query_papers.submit(generate_keywords,[query_papers], [keywords_papers])
+    search_papers.click(find_papers,[query_papers,keywords_papers,after], [papers_dataframe,citations_network,papers_summary])
+
     # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
     # (textbox
     #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
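
Note on the new flow: the Papers tab chains two events — submitting the question textbox calls generate_keywords, and the Search button streams find_papers into the dataframe, network and summary components. Below is a minimal sketch of the same flow outside Gradio (it assumes the objects defined in app.py above; papers_flow is a hypothetical helper, not part of the commit):

import asyncio

async def papers_flow(question):
    # generate_keywords, find_papers, oa and reranker come from app.py above
    keywords = generate_keywords(question)   # e.g. "deep sea mining"
    async for df_works, network_html, summary in find_papers(question, keywords, after=1960):
        pass   # Gradio pushes each yielded triple to the UI; here we just keep the last one
    return df_works, network_html, summary

# asyncio.run(papers_flow("How will El Nino be impacted by climate change?"))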
climateqa/engine/keywords.py ADDED
@@ -0,0 +1,30 @@
+
+from typing import List
+from typing import Literal
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+class KeywordsOutput(BaseModel):
+    """Analyzing the user query to get keywords for a search engine"""
+
+    keywords: list = Field(
+        description="""
+        Generate 1 or 2 relevant keywords from the user query to ask a search engine for scientific research papers.
+
+        Example:
+        - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
+        - "How will El Nino be impacted by climate change" -> ["el nino"]
+        - "Is climate change a hoax" -> ["climate change","hoax"]
+        """
+    )
+
+
+def make_keywords_chain(llm):
+
+    functions = [convert_to_openai_function(KeywordsOutput)]
+    llm_functions = llm.bind(functions = functions,function_call={"name":"KeywordsOutput"})
+
+    chain = llm_functions | JsonOutputFunctionsParser()
+    return chain
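
A possible usage sketch for this chain (assuming llm is a function-calling chat model, such as the ChatOpenAI instance the app already builds):

chain = make_keywords_chain(llm)
result = chain.invoke("What is the impact of deep sea mining ?")
# result is the parsed function-call output, e.g. {"keywords": ["deep sea mining"]}
search_query = " AND ".join(result["keywords"])   # -> "deep sea mining", ready for OpenAlex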
climateqa/engine/prompts.py CHANGED
@@ -60,6 +60,32 @@ Question: {question} - Explained to {audience}
 Answer in {language} with the passages citations:
 """

+
+papers_prompt_template = """
+You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted abstracts of scientific papers. Provide a clear and structured answer based on the abstracts provided, the context and the guidelines.
+
+Guidelines:
+- If the passages have useful facts or numbers, use them in your answer.
+- When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
+- Do not use the sentence 'Doc i says ...' to say where information came from.
+- If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
+- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
+- If it makes sense, use bullet points and lists to make your answers easier to understand.
+- Use markdown to format your answer and make it easier to read.
+- You do not need to use every passage. Only use the ones that help answer the question.
+- If the documents do not have the information needed to answer the question, just say you do not have enough information.
+
+-----------------------
+Abstracts:
+{context}
+
+-----------------------
+Question: {question}
+Answer in {language} with the passages citations:
+"""
+
+
+
 answer_prompt_images_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics.
 You are given the answer to an environmental question based on passages from the IPCC and IPBES reports and image captions.
climateqa/engine/rag.py CHANGED
@@ -8,7 +8,9 @@ from langchain_core.prompts.base import format_document

 from climateqa.engine.reformulation import make_reformulation_chain
 from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
+from climateqa.engine.prompts import papers_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
+from climateqa.engine.keywords import make_keywords_chain

 DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

@@ -21,7 +23,11 @@ def _combine_documents(
     for i,doc in enumerate(docs):
         # chunk_type = "Doc" if doc.metadata["chunk_type"] == "text" else "Image"
         chunk_type = "Doc"
-        doc_string = f"{chunk_type} {i+1}: " + format_document(doc, document_prompt)
+        if isinstance(doc,str):
+            doc_formatted = doc
+        else:
+            doc_formatted = format_document(doc, document_prompt)
+        doc_string = f"{chunk_type} {i+1}: " + doc_formatted
         doc_string = doc_string.replace("\n"," ")
         doc_strings.append(doc_string)

@@ -37,7 +43,6 @@ def get_image_docs(x):

 def make_rag_chain(retriever,llm):

-
     # Construct the prompt
     prompt = ChatPromptTemplate.from_template(answer_prompt_template)
     prompt_without_docs = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
@@ -46,6 +51,11 @@ def make_rag_chain(retriever,llm):
     reformulation = make_reformulation_chain(llm)
     reformulation = prepare_chain(reformulation,"reformulation")

+    # ------- Find all keywords from the reformulated query
+    keywords = make_keywords_chain(llm)
+    keywords = {"keywords":itemgetter("question") | keywords}
+    keywords = prepare_chain(keywords,"keywords")
+
     # ------- CHAIN 1
     # Retrieved documents
     find_documents = {"docs": itemgetter("question") | retriever} | RunnablePassthrough()
@@ -55,7 +65,7 @@ def make_rag_chain(retriever,llm):
     # Construct inputs for the llm
     input_documents = {
         "context":lambda x : _combine_documents(x["docs"]),
-        **pass_values(["question","audience","language"])
+        **pass_values(["question","audience","language","keywords"])
     }

     # ------- CHAIN 3
@@ -64,12 +74,12 @@ def make_rag_chain(retriever,llm):

     answer_with_docs = {
         "answer": input_documents | prompt | llm_final | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs"]),
+        **pass_values(["question","audience","language","query","docs","keywords"]),
     }

     answer_without_docs = {
         "answer": prompt_without_docs | llm_final | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs"]),
+        **pass_values(["question","audience","language","query","docs","keywords"]),
     }

     # def has_images(x):
@@ -87,11 +97,29 @@ def make_rag_chain(retriever,llm):

     # ------- FINAL CHAIN
     # Build the final chain
-    rag_chain = reformulation | find_documents | answer
+    rag_chain = reformulation | keywords | find_documents | answer

     return rag_chain


+def make_rag_papers_chain(llm):
+
+    prompt = ChatPromptTemplate.from_template(papers_prompt_template)
+
+    input_documents = {
+        "context":lambda x : _combine_documents(x["docs"]),
+        **pass_values(["question","language"])
+    }
+
+    chain = input_documents | prompt | llm | StrOutputParser()
+    chain = rename_chain(chain,"answer")
+
+    return chain
+
+
+

 def make_illustration_chain(llm):

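
For reference, the papers chain above is consumed token by token through astream_log, mirroring what app.py does with it. A minimal sketch (assuming llm and a list of abstract strings; summarize_papers is a hypothetical helper):

async def summarize_papers(llm, question, abstracts):
    chain = make_rag_papers_chain(llm)
    result = chain.astream_log({"question": question, "docs": abstracts, "language": "English"})
    summary = ""
    async for patch in result:
        op = patch.ops[0]
        if op["path"] == "/logs/StrOutputParser/streamed_output/-":
            summary += op["value"]   # one streamed token at a time
    return summary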
climateqa/papers/__init__.py ADDED
@@ -0,0 +1,43 @@
+import pandas as pd
+
+from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
+import pyalex
+
+pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
+
+class OpenAlex():
+    def __init__(self):
+        pass
+
+
+    def search(self,keywords,n_results = 100,after = None,before = None):
+        works = Works().search(keywords)
+
+        for page in works.paginate(per_page=n_results):
+            break
+
+        df_works = pd.DataFrame(page)
+
+        return df_works
+
+
+    def make_network(self):
+        pass
+
+
+    def get_abstract_from_inverted_index(self,index):
+
+        # Determine the maximum index to know the length of the reconstructed array
+        max_index = max([max(positions) for positions in index.values()])
+
+        # Initialize a list with placeholders for all positions
+        reconstructed = [''] * (max_index + 1)
+
+        # Iterate through the inverted index and place each token at its respective position(s)
+        for token, positions in index.items():
+            for position in positions:
+                reconstructed[position] = token
+
+        # Join the tokens to form the reconstructed sentence(s)
+        return ' '.join(reconstructed)
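
OpenAlex ships abstracts as an inverted index (token -> list of positions) rather than plain text; the helper above re-inflates it. A worked example with made-up tokens:

index = {"climate": [0], "change": [1, 4], "drives": [2], "rapid": [3]}
# max position is 4, so 5 slots; each token is written back at its position(s):
# ['climate', 'change', 'drives', 'rapid', 'change']
OpenAlex().get_abstract_from_inverted_index(index)
# -> 'climate change drives rapid change'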
climateqa/papers/openalex.py ADDED
@@ -0,0 +1,142 @@
+import pandas as pd
+import networkx as nx
+import matplotlib.pyplot as plt
+from pyvis.network import Network
+
+from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
+import pyalex
+
+pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
+
+class OpenAlex():
+    def __init__(self):
+        pass
+
+
+    def search(self,keywords,n_results = 100,after = None,before = None):
+
+        if isinstance(keywords,str):
+            works = Works().search(keywords)
+            if after is not None:
+                assert isinstance(after,int), "after must be an integer"
+                assert after > 1900, "after must be greater than 1900"
+                works = works.filter(publication_year=f">{after}")
+
+            for page in works.paginate(per_page=n_results):
+                break
+
+            df_works = pd.DataFrame(page)
+            df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x))
+            df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
+            df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
+            df_works["content"] = df_works["title"] + "\n" + df_works["abstract"]
+
+        else:
+            df_works = []
+            for keyword in keywords:
+                df_keyword = self.search(keyword,n_results = n_results,after = after,before = before)
+                df_works.append(df_keyword)
+            df_works = pd.concat(df_works,ignore_index=True,axis = 0)
+        return df_works
+
+
+    def rerank(self,query,df,reranker):
+
+        scores = reranker.rank(
+            query,
+            df["content"].tolist(),
+            top_k = len(df),
+        )
+        scores.sort(key = lambda x : x["corpus_id"])
+        scores = [x["score"] for x in scores]
+        df["rerank_score"] = scores
+        return df
+
+
+    def make_network(self,df):
+
+        # Initialize the directed citation graph
+        G = nx.DiGraph()
+
+        for i,row in df.iterrows():
+            paper = row.to_dict()
+            G.add_node(paper['id'], **paper)
+            for reference in paper['referenced_works']:
+                if reference not in G:
+                    pass
+                else:
+                    # G.add_node(reference, id=reference, title="", reference_works=[], original=False)
+                    G.add_edge(paper['id'], reference, relationship="CITING")
+        return G
+
+    def show_network(self,G,height = "750px",notebook = True,color_by = "pagerank"):
+
+        net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="black",notebook = notebook,directed = True,neighborhood_highlight = True)
+        net.force_atlas_2based()
+
+        # Add nodes with size reflecting the PageRank to highlight importance
+        pagerank = nx.pagerank(G)
+
+        if color_by == "pagerank":
+            color_scores = pagerank
+        elif color_by == "rerank_score":
+            color_scores = {node: G.nodes[node].get("rerank_score", 0) for node in G.nodes}
+        else:
+            raise ValueError(f"Unknown color_by value: {color_by}")
+
+        # Normalize the scores to [0, 1] for color mapping
+        min_score = min(color_scores.values())
+        max_score = max(color_scores.values())
+        norm_color_scores = {node: (color_scores[node] - min_score) / (max_score - min_score) for node in color_scores}
+
+
+        for node in G.nodes:
+            info = G.nodes[node]
+            title = info["title"]
+            label = title[:30] + " ..."
+
+            title = [title,f"Year: {info['publication_year']}",f"ID: {info['id']}"]
+            title = "\n".join(title)
+
+            color_value = norm_color_scores[node]
+            # Generating a color from blue (low) to red (high)
+            color = plt.cm.RdBu_r(color_value) # RdBu_r is a matplotlib colormap from blue to red
+            def clamp(x):
+                return int(max(0, min(x*255, 255)))
+            color = tuple([clamp(x) for x in color[:3]])
+            color = '#%02x%02x%02x' % color
+
+            net.add_node(node, title=title,size = pagerank[node]*1000,label = label,color = color)
+
+        # Add edges
+        for edge in G.edges:
+            net.add_edge(edge[0], edge[1],arrowStrikethrough=True,color = "gray")
+
+        # Show the network
+        if notebook:
+            return net.show("network.html")
+        else:
+            return net
+
+
+    def get_abstract_from_inverted_index(self,index):
+
+        if index is None:
+            return ""
+        else:
+
+            # Determine the maximum index to know the length of the reconstructed array
+            max_index = max([max(positions) for positions in index.values()])
+
+            # Initialize a list with placeholders for all positions
+            reconstructed = [''] * (max_index + 1)
+
+            # Iterate through the inverted index and place each token at its respective position(s)
+            for token, positions in index.items():
+                for position in positions:
+                    reconstructed[position] = token
+
+            # Join the tokens to form the reconstructed sentence(s)
+            return ' '.join(reconstructed)
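
Putting the class together, a sketch of the end-to-end pipeline as find_papers uses it (the CrossEncoder is the same reranker app.py loads; the search terms and dates are only illustrative):

from sentence_transformers import CrossEncoder

oa = OpenAlex()
reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")

df = oa.search("deep sea mining", n_results=50, after=2010)
df = df.dropna(subset=["abstract"])
df = oa.rerank("What is the impact of deep sea mining?", df, reranker)
df = df.sort_values("rerank_score", ascending=False)

G = oa.make_network(df)
net = oa.show_network(G, color_by="rerank_score", notebook=False)
html = net.generate_html()   # app.py wraps this HTML in an iframe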
requirements.txt CHANGED
@@ -5,6 +5,9 @@ python-dotenv==1.0.0
 langchain==0.1.4
 langchain_openai==0.0.6
 pinecone-client==3.0.2
-sentence-transformers
+sentence-transformers==2.6.0
 huggingface-hub
-msal
+msal
+pyalex==0.13
+networkx==3.2.1
+pyvis==0.3.2