import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network

from langchain_core.retrievers import BaseRetriever
from langchain_core.documents.base import Document
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun

from ..engine.utils import num_tokens_from_string

from typing import List, Optional

from pyalex import Works
import pyalex

pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"


def replace_nan_with_empty_dict(x):
    # pandas stores missing dict-valued cells as NaN; normalize them to {}
    return x if pd.notna(x) else {}

class OpenAlex:
    """Thin wrapper around the OpenAlex API (via pyalex): keyword search, reranking, and citation networks."""


    def search(self, keywords: str, n_results: int = 100, after: Optional[int] = None, before: Optional[int] = None) -> pd.DataFrame:
        """Keyword search on OpenAlex works, optionally restricted to a publication-year window."""

        if not isinstance(keywords, str):
            raise TypeError("keywords must be a string")

        works = Works().search(keywords)
        if after is not None:
            assert isinstance(after, int), "after must be an integer"
            assert after > 1900, "after must be greater than 1900"
            works = works.filter(publication_year=f">{after}")
        if before is not None:
            assert isinstance(before, int), "before must be an integer"
            works = works.filter(publication_year=f"<{before}")

        # Only the first page of results is fetched
        page = next(iter(works.paginate(per_page=n_results)))

        df_works = pd.DataFrame(page)
        df_works = df_works.dropna(subset=["title"])
        df_works["primary_location"] = df_works["primary_location"].map(replace_nan_with_empty_dict)
        df_works["abstract"] = df_works["abstract_inverted_index"].apply(self.get_abstract_from_inverted_index).fillna("")
        df_works["is_oa"] = df_works["open_access"].map(lambda x: x.get("is_oa", False))
        df_works["pdf_url"] = df_works["primary_location"].map(lambda x: x.get("pdf_url", None))
        df_works["url"] = df_works["id"]
        df_works["content"] = (df_works["title"] + "\n" + df_works["abstract"]).map(lambda x: x.strip())
        df_works["num_tokens"] = df_works["content"].map(num_tokens_from_string)
        df_works = df_works.drop(columns=["abstract_inverted_index"])

        return df_works
    

    def rerank(self, query: str, df: pd.DataFrame, reranker) -> pd.DataFrame:
        """Score each row's content against the query with a reranker (e.g. from the rerankers package)."""

        scores = reranker.rank(query, df["content"].tolist())
        # doc_id is the position of the document in the input list, so sorting
        # by it restores the original row order of the DataFrame
        scores = sorted(scores.results, key=lambda x: x.document.doc_id)
        df["rerank_score"] = [x.score for x in scores]
        return df


    def make_network(self, df: pd.DataFrame) -> nx.DiGraph:
        """Build a directed citation graph where an edge A -> B means paper A cites paper B."""

        G = nx.DiGraph()

        # First pass: add every paper as a node, so that edges are not dropped
        # when a paper references another paper that appears later in the DataFrame
        for i, row in df.iterrows():
            paper = row.to_dict()
            G.add_node(paper["id"], **paper)

        # Second pass: add citation edges, keeping only references that point
        # to another paper in the result set
        for i, row in df.iterrows():
            paper = row.to_dict()
            for reference in paper["referenced_works"]:
                if reference in G:
                    G.add_edge(paper["id"], reference, relationship="CITING")
        return G

    def show_network(self, G, height="750px", notebook=True, color_by="pagerank"):
        """Render the citation graph with pyvis; node size reflects PageRank, node color reflects `color_by`."""

        net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="black", notebook=notebook, directed=True, neighborhood_highlight=True)
        net.force_atlas_2based()

        # Node size reflects PageRank to highlight structurally important papers
        pagerank = nx.pagerank(G)

        if color_by == "pagerank":
            color_scores = pagerank
        elif color_by == "rerank_score":
            color_scores = {node: G.nodes[node].get("rerank_score", 0) for node in G.nodes}
        else:
            raise ValueError(f"Unknown color_by value: {color_by}")

        # Normalize the color scores to [0, 1] for colormap lookup,
        # guarding against division by zero when all scores are equal
        min_score = min(color_scores.values())
        max_score = max(color_scores.values())
        score_range = max_score - min_score
        if score_range == 0:
            norm_color_scores = {node: 0.5 for node in color_scores}
        else:
            norm_color_scores = {node: (color_scores[node] - min_score) / score_range for node in color_scores}

        def clamp(x):
            return int(max(0, min(x * 255, 255)))

        for node in G.nodes:
            info = G.nodes[node]
            title = info["title"]
            label = title[:30] + " ..."

            tooltip = "\n".join([title, f"Year: {info['publication_year']}", f"ID: {info['id']}"])

            # Map the normalized score to a color from blue (low) to red (high)
            color_value = norm_color_scores[node]
            rgba = plt.cm.RdBu_r(color_value)  # RdBu_r is a matplotlib colormap from blue to red
            color = "#%02x%02x%02x" % tuple(clamp(x) for x in rgba[:3])

            net.add_node(node, title=tooltip, size=pagerank[node] * 1000, label=label, color=color)

        # Add edges
        for edge in G.edges:
            net.add_edge(edge[0], edge[1], arrowStrikethrough=True, color="gray")

        # Show the network
        if notebook:
            return net.show("network.html")
        else:
            return net


    def get_abstract_from_inverted_index(self, index):
        """Reconstruct an abstract from OpenAlex's inverted index, e.g.
        {"Hello": [0], "world": [1]} -> "Hello world"."""

        if index is None:
            return ""

        # Determine the maximum position to know the length of the reconstructed text
        max_index = max(max(positions) for positions in index.values())

        # Initialize a list with placeholders for all positions
        reconstructed = [""] * (max_index + 1)

        # Place each token at its respective position(s)
        for token, positions in index.items():
            for position in positions:
                reconstructed[position] = token

        # Join the tokens to form the reconstructed text
        return " ".join(reconstructed)


class OpenAlexRetriever(BaseRetriever):
    """LangChain retriever that searches OpenAlex and returns works as Documents."""

    min_year: int = 1960
    max_year: Optional[int] = None
    k: int = 100

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:

        openalex = OpenAlex()

        # Search for documents
        df_docs = openalex.search(query, n_results=self.k, after=self.min_year, before=self.max_year)

        docs = []
        for i, row in df_docs.iterrows():
            # Skip works whose title + abstract is too short to be informative
            # or too long to embed cheaply
            num_tokens = row["num_tokens"]
            if num_tokens < 50 or num_tokens > 1000:
                continue

            docs.append(
                Document(
                    page_content=row["content"],
                    metadata=row.to_dict(),
                )
            )
        return docs
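

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library surface. It assumes network
    # access to the OpenAlex API and that the module is run from within the
    # package so the relative import of num_tokens_from_string resolves.
    # The query string and parameter values below are illustrative only.
    openalex = OpenAlex()

    # Fetch up to 20 works on a topic, published after 2015
    df = openalex.search("climate change adaptation", n_results=20, after=2015)
    print(df[["title", "publication_year", "num_tokens"]].head())

    # Build the citation network between the retrieved papers and export it
    G = openalex.make_network(df)
    net = openalex.show_network(G, notebook=False)
    net.save_graph("network.html")

    # Or go through the LangChain retriever interface (invoke is the Runnable
    # entry point in recent langchain_core versions)
    retriever = OpenAlexRetriever(min_year=2010, k=50)
    docs = retriever.invoke("climate change adaptation")
    print(f"Retrieved {len(docs)} documents")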