kiyer committed on
Commit
a354504
1 Parent(s): 1fa5fdb

Upload 13 files

Browse files
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  local_files/galaxy_worldmap_kiyer-min.png filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  local_files/galaxy_worldmap_kiyer-min.png filter=lfs diff=lfs merge=lfs -text
37
+ data/astrophindex.faiss filter=lfs diff=lfs merge=lfs -text
data/astrophindex.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10089ae87900eb8c9f1698c36bb5d1128d712b7b90f3f3d07f074ab130027440
3
+ size 2163879981
data/data-00000-of-00008.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fd656a426f8ff3d7c2bb7164154b042d50f5e5deeeb7ead12b9baee7b9d5f8d
3
+ size 509410376
data/data-00001-of-00008.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:952f8125fc109ff15b7abbfb10ac928b6070087899e79ebbf62500b018a2bac7
3
+ size 503809992
data/data-00002-of-00008.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1069ba9c791822f5761b7311f69bc0323294751499988bcb3ae904b339a89dd
3
+ size 504473320
data/data-00003-of-00008.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f91fb8b3aed602e091b89e35fff7432d4573f5ccbdc2d5d9d19daabbacc3a5b
3
+ size 508874456
data/data-00004-of-00008.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb7f52083e24ef80fc12bca90c88aac3159d5479af9f29de4a7740dacb85d9d6
3
+ size 501644912
data/data-00005-of-00008.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f77f69ede7a456f8f59873e482bf381626532b945e50b497b91834d814ba8ce2
3
+ size 501508320
data/data-00006-of-00008.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:762b9acd3aa46c27b40af2a681b638e6ac345fbb508360c68f815e284d61f584
3
+ size 470994224
data/data-00007-of-00008.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58b898027e4506c6a7a0afb9bdc5e61c8df13a8d5089a68383a457dc6dea1b72
3
+ size 497639288
data/dataset_info.json CHANGED
@@ -3,59 +3,43 @@
3
  "citation": "",
4
  "config_name": "default",
5
  "dataset_name": "pathfinder_arxiv_data",
6
- "dataset_size": 5770056875,
7
  "description": "",
8
  "download_checksums": {
9
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00000-of-00012.parquet": {
10
- "num_bytes": 384481705,
11
  "checksum": null
12
  },
13
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00001-of-00012.parquet": {
14
- "num_bytes": 383347319,
15
  "checksum": null
16
  },
17
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00002-of-00012.parquet": {
18
- "num_bytes": 383133689,
19
  "checksum": null
20
  },
21
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00003-of-00012.parquet": {
22
- "num_bytes": 384399351,
23
  "checksum": null
24
  },
25
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00004-of-00012.parquet": {
26
- "num_bytes": 382810245,
27
  "checksum": null
28
  },
29
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00005-of-00012.parquet": {
30
- "num_bytes": 382870394,
31
  "checksum": null
32
  },
33
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00006-of-00012.parquet": {
34
- "num_bytes": 364849142,
35
  "checksum": null
36
  },
37
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00007-of-00012.parquet": {
38
- "num_bytes": 363965178,
39
- "checksum": null
40
- },
41
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00008-of-00012.parquet": {
42
- "num_bytes": 376639054,
43
- "checksum": null
44
- },
45
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00009-of-00012.parquet": {
46
- "num_bytes": 384035100,
47
- "checksum": null
48
- },
49
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00010-of-00012.parquet": {
50
- "num_bytes": 355126903,
51
- "checksum": null
52
- },
53
- "hf://datasets/kiyer/pathfinder_arxiv_data@66fc52fb3d7d82779c3d73b0cb0c14218cb02e63/data/train-00011-of-00012.parquet": {
54
- "num_bytes": 359912183,
55
  "checksum": null
56
  }
57
  },
58
- "download_size": 4505570263,
59
  "features": {
60
  "ads_id": {
61
  "dtype": "string",
@@ -156,25 +140,22 @@
156
  },
157
  "homepage": "",
158
  "license": "",
159
- "size_in_bytes": 10275627138,
160
  "splits": {
161
  "train": {
162
  "name": "train",
163
- "num_bytes": 5770056875,
164
- "num_examples": 499142,
165
  "shard_lengths": [
166
- 42596,
167
- 43596,
168
- 43595,
169
- 42595,
170
- 43595,
171
- 43595,
172
- 46595,
173
- 44595,
174
- 43595,
175
- 43595,
176
- 43595,
177
- 17595
178
  ],
179
  "dataset_name": "pathfinder_arxiv_data"
180
  }
 
3
  "citation": "",
4
  "config_name": "default",
5
  "dataset_name": "pathfinder_arxiv_data",
6
+ "dataset_size": 4065510154,
7
  "description": "",
8
  "download_checksums": {
9
+ "hf://datasets/kiyer/pathfinder_arxiv_data@1a8eaa7eef5a503386a1487e20f13bedba605245/data/train-00000-of-00008.parquet": {
10
+ "num_bytes": 406754152,
11
  "checksum": null
12
  },
13
+ "hf://datasets/kiyer/pathfinder_arxiv_data@1a8eaa7eef5a503386a1487e20f13bedba605245/data/train-00001-of-00008.parquet": {
14
+ "num_bytes": 405109745,
15
  "checksum": null
16
  },
17
+ "hf://datasets/kiyer/pathfinder_arxiv_data@1a8eaa7eef5a503386a1487e20f13bedba605245/data/train-00002-of-00008.parquet": {
18
+ "num_bytes": 405466052,
19
  "checksum": null
20
  },
21
+ "hf://datasets/kiyer/pathfinder_arxiv_data@1a8eaa7eef5a503386a1487e20f13bedba605245/data/train-00003-of-00008.parquet": {
22
+ "num_bytes": 406784839,
23
  "checksum": null
24
  },
25
+ "hf://datasets/kiyer/pathfinder_arxiv_data@1a8eaa7eef5a503386a1487e20f13bedba605245/data/train-00004-of-00008.parquet": {
26
+ "num_bytes": 404752067,
27
  "checksum": null
28
  },
29
+ "hf://datasets/kiyer/pathfinder_arxiv_data@1a8eaa7eef5a503386a1487e20f13bedba605245/data/train-00005-of-00008.parquet": {
30
+ "num_bytes": 404624503,
31
  "checksum": null
32
  },
33
+ "hf://datasets/kiyer/pathfinder_arxiv_data@1a8eaa7eef5a503386a1487e20f13bedba605245/data/train-00006-of-00008.parquet": {
34
+ "num_bytes": 392634525,
35
  "checksum": null
36
  },
37
+ "hf://datasets/kiyer/pathfinder_arxiv_data@1a8eaa7eef5a503386a1487e20f13bedba605245/data/train-00007-of-00008.parquet": {
38
+ "num_bytes": 397101125,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  "checksum": null
40
  }
41
  },
42
+ "download_size": 3223227008,
43
  "features": {
44
  "ads_id": {
45
  "dtype": "string",
 
140
  },
141
  "homepage": "",
142
  "license": "",
143
+ "size_in_bytes": 7288737162,
144
  "splits": {
145
  "train": {
146
  "name": "train",
147
+ "num_bytes": 4065510154,
148
+ "num_examples": 352194,
149
  "shard_lengths": [
150
+ 43000,
151
+ 43025,
152
+ 43025,
153
+ 43024,
154
+ 44024,
155
+ 44024,
156
+ 46024,
157
+ 44024,
158
+ 2024
 
 
 
159
  ],
160
  "dataset_name": "pathfinder_arxiv_data"
161
  }
data/state.json CHANGED
@@ -1,43 +1,31 @@
1
  {
2
  "_data_files": [
3
  {
4
- "filename": "data-00000-of-00012.arrow"
5
  },
6
  {
7
- "filename": "data-00001-of-00012.arrow"
8
  },
9
  {
10
- "filename": "data-00002-of-00012.arrow"
11
  },
12
  {
13
- "filename": "data-00003-of-00012.arrow"
14
  },
15
  {
16
- "filename": "data-00004-of-00012.arrow"
17
  },
18
  {
19
- "filename": "data-00005-of-00012.arrow"
20
  },
21
  {
22
- "filename": "data-00006-of-00012.arrow"
23
  },
24
  {
25
- "filename": "data-00007-of-00012.arrow"
26
- },
27
- {
28
- "filename": "data-00008-of-00012.arrow"
29
- },
30
- {
31
- "filename": "data-00009-of-00012.arrow"
32
- },
33
- {
34
- "filename": "data-00010-of-00012.arrow"
35
- },
36
- {
37
- "filename": "data-00011-of-00012.arrow"
38
  }
39
  ],
40
- "_fingerprint": "10a80a75c30e04f8",
41
  "_format_columns": null,
42
  "_format_kwargs": {},
43
  "_format_type": null,
 
1
  {
2
  "_data_files": [
3
  {
4
+ "filename": "data-00000-of-00008.arrow"
5
  },
6
  {
7
+ "filename": "data-00001-of-00008.arrow"
8
  },
9
  {
10
+ "filename": "data-00002-of-00008.arrow"
11
  },
12
  {
13
+ "filename": "data-00003-of-00008.arrow"
14
  },
15
  {
16
+ "filename": "data-00004-of-00008.arrow"
17
  },
18
  {
19
+ "filename": "data-00005-of-00008.arrow"
20
  },
21
  {
22
+ "filename": "data-00006-of-00008.arrow"
23
  },
24
  {
25
+ "filename": "data-00007-of-00008.arrow"
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
27
  ],
28
+ "_fingerprint": "216019f3026e4d55",
29
  "_format_columns": null,
30
  "_format_kwargs": {},
31
  "_format_type": null,
prompts.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ react_prompt = """You are an expert astronomer and cosmologist.
2
+ Answer the following question as best you can using information from the library, but speaking in a concise and factual manner.
3
+ If you cannot come up with an answer, say you do not know.
4
+ Try to break the question down into smaller steps and solve it in a logical manner.
5
+
6
+ You have access to the following tools:
7
+
8
+ {tools}
9
+
10
+ Use the following format:
11
+
12
+ Question: the input question you must answer
13
+ Thought: you should always think about what to do
14
+ Action: the action to take, should be one of [{tool_names}]
15
+ Action Input: the input to the action
16
+ Observation: the result of the action
17
+ ... (this Thought/Action/Action Input/Observation can repeat N times)
18
+ Thought: I now know the final answer
19
+ Final Answer: the final answer to the original input question. Provide information about how you arrived at the answer, and any nuances or uncertainties the reader should be aware of
20
+
21
+ Begin! Remember to speak in a pedagogical and factual manner.
22
+
23
+ Question: {input}
24
+ Thought:{agent_scratchpad}"""
25
+
26
+ regular_prompt = """You are an expert astronomer and cosmologist.
27
+ Answer the following question as best you can using information from the library, but speaking in a concise and factual manner.
28
+ If you cannot come up with an answer, say you do not know.
29
+ Try to break the question down into smaller steps and solve it in a logical manner.
30
+
31
+ Provide information about how you arrived at the answer, and any nuances or uncertainties the reader should be aware of.
32
+
33
+ Begin! Remember to speak in a pedagogical and factual manner.
34
+
35
+ Relevant documents:{context}
36
+
37
+ Question: {question}
38
+ Answer:"""
39
+
40
+ bibliometric_prompt = """You are an AI assistant with expertise in astronomy and astrophysics literature. Your task is to assist with relevant bibliometric information in response to a user question. The user question may consist of identifying key papers, authors, or trends in a specific area of astronomical research.
41
+
42
+ Depending on what the user wants, direct them to consult the NASA Astrophysics Data System (ADS) at https://ui.adsabs.harvard.edu/. Provide them with the recommended ADS query depending on their question.
43
+
44
+ Here's a more detailed guide on how to use NASA ADS for various types of queries:
45
+
46
+ Basic topic search: Enter keywords in the search bar, e.g., "exoplanets". Use quotation marks for exact phrases, e.g., "dark energy"
47
+ Author search: Use the syntax author:"Last Name, First Name", e.g., author:"Hawking, S". For papers by multiple authors, use AND, e.g., author:"Hawking, S" AND author:"Ellis, G"
48
+ Date range: Use year:YYYY-YYYY, e.g., year:2010-2020. For papers since a certain year, use year:YYYY-, e.g., year:2015-
49
+ Combining search terms: Use AND, OR, NOT operators, e.g., "black holes" AND (author:"Hawking, S" OR author:"Penrose, R")
50
+ Filtering results: Use the left sidebar to filter by publication year, article type, or astronomy database
51
+ Sorting results: Use the "Sort" dropdown menu to order by options like citation count, publication date, or relevance
52
+ Advanced searches: Click on the "Search" dropdown menu and select "Classic Form" for field-specific searches. Use bibcode:YYYY for a specific journal/year, e.g., bibcode:2020ApJ to find all Astrophysical Journal papers from 2020
53
+ Finding review articles: Wrap the query in the reviews() operator (e.g. reviews("dark energy"))
54
+ Excluding preprints: Add NOT doctype:"eprint" to your search
55
+ Citation metrics: Click on the citation count of a paper to see its citation history and who has cited it
56
+
57
+ Some examples:
58
+
59
+ Example 1:
60
+ “How many papers published in 2022 used data from MAST missions?”
61
+ Your response should be: year:2022 data:"MAST"
62
+
63
+ Example 2:
64
+ “What are the most cited papers on spiral galaxy halos measured in X-rays, with publication date from 2010 to 2023?”
65
+ Your response should be: "spiral galaxy halos" AND "x-ray" year:2010-2023
66
+
67
+ Example 3:
68
+ “Can you list 3 papers published by “<name>” as first author?”
69
+ Your response should be: author:"^<name>"
70
+
71
+ Example 4:
72
+ “Based on papers with “<name>” as an author or co-author, can you suggest the five most recent astro-ph papers that would be relevant?”
73
+ Your response should be:
74
+
75
+ Remember to advise users that while these examples cover many common scenarios, NASA ADS has many more advanced features that can be explored through its documentation.
76
+
77
+ Relevant documents:{context}
78
+ Question: {question}
79
+
80
+ Response:"""
81
+
82
+ single_paper_prompt = """You are an astronomer with access to a vast database of astronomical facts and figures. Your task is to provide a concise, accurate answer to the following specific factual question about astronomy or astrophysics.
83
+ Provide the requested information clearly and directly. If relevant, include the source of your information or any recent updates to this fact. If there's any uncertainty or variation in the accepted value, briefly explain why.
84
+ If the question can't be answered with a single fact, provide a short, focused explanation. Always prioritize accuracy over speculation.
85
+ Relevant documents:{context}
86
+ Question: {question}
87
+ Response:"""
88
+
89
+ deep_knowledge_prompt = """You are an expert astronomer with deep knowledge across various subfields of astronomy and astrophysics. Your task is to provide a comprehensive and nuanced answer to the following question, which involves an unresolved topic or requires broad, common-sense understanding.
90
+ Consider multiple perspectives and current debates in the field. Explain any uncertainties or ongoing research. If relevant, mention how this topic connects to other areas of astronomy.
91
+ Provide your response in a clear, pedagogical manner, breaking down complex concepts for easier understanding. If appropriate, suggest areas where further research might be needed.
92
+ After formulating your initial response, take a moment to reflect on your answer. Consider:
93
+ 1. Have you addressed all aspects of the question?
94
+ 2. Are there any potential biases or assumptions in your explanation?
95
+ 3. Is your explanation clear and accessible to someone with a general science background?
96
+ 4. Have you adequately conveyed the uncertainties or debates surrounding this topic?
97
+ Based on this reflection, refine your answer as needed.
98
+ Remember, while you have extensive knowledge, it's okay to acknowledge the limits of current scientific understanding. If parts of the question cannot be answered definitively, explain why.
99
+ Relevant documents:{context}
100
+
101
+ Question: {question}
102
+
103
+ Initial Response:
104
+ [Your initial response here]
105
+
106
+ Reflection and Refinement:
107
+ [Your reflections and any refinements to your answer here]
108
+
109
+ Final Response:
110
+ [Your final, refined answer here]"""
111
+
112
+ question_categorization_prompt = """You are an expert astrophysicist and computer scientist specializing in linguistics and semantics. Your task is to categorize a given query into one of the following categories:
113
+
114
+ 1. Summarization
115
+ 2. Single-paper factual
116
+ 3. Multi-paper factual
117
+ 4. Named entity recognition
118
+ 5. Jargon-specific questions / overloaded words
119
+ 6. Time-sensitive
120
+ 7. Consensus evaluation
121
+ 8. What-ifs and counterfactuals
122
+ 9. Compositional
123
+
124
+ Analyze the query carefully, considering its content, structure, and implications. Then, determine which of the above categories best fits the query.
125
+
126
+ In your analysis, consider the following:
127
+ - Does the query ask for a well-known datapoint or mechanism?
128
+ - Can it be answered by a single paper or does it require multiple sources?
129
+ - Does it involve proper nouns or specific scientific terms?
130
+ - Is it time-dependent or likely to change in the near future?
131
+ - Does it require evaluating consensus across multiple sources?
132
+ - Is it a hypothetical or counterfactual question?
133
+ - Does it need to be broken down into sub-queries (i.e. compositional)?
134
+
135
+ After your analysis, categorize the query into one of the nine categories listed above.
136
+
137
+ Provide a brief explanation for your categorization, highlighting the key aspects of the query that led to your decision.
138
+
139
+ Present your final answer in the following format:
140
+
141
+ <categorization>
142
+ Category: [Selected category]
143
+ Explanation: [Your explanation for the categorization]
144
+ </categorization>"""