gamingflexer committed on
Commit
69151cb
1 Parent(s): 9269cc3

Refactor get_paper_details_batch method to store metadata

Browse files
Files changed (1) hide show
  1. src/scrapper/main.py +6 -3
src/scrapper/main.py CHANGED
@@ -34,16 +34,19 @@ class ArxivPaper:
34
 
35
  def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
36
  path_author = os.path.join(path, self.author_name.replace(" ", "_"))
37
- data = {}
38
  for i in tqdm(paper_ids):
39
  try:
40
  paper = Arxiv(i)
41
  paper.load(path_author)
42
- paper.get_meta()
43
  refs = paper.get_refs(
44
  extractor=self.extractor,
45
  text_splitter=self.text_splitter,)
46
  paper.chunker()
47
  paper.save_chunks(include_metadata=True, path=path_author)
 
48
  except Exception as e:
49
- print(f"Error processing paper {i}: {e}")
 
 
 
34
 
35
  def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
36
  path_author = os.path.join(path, self.author_name.replace(" ", "_"))
37
+ data = []
38
  for i in tqdm(paper_ids):
39
  try:
40
  paper = Arxiv(i)
41
  paper.load(path_author)
42
+ metadata = paper.get_meta()
43
  refs = paper.get_refs(
44
  extractor=self.extractor,
45
  text_splitter=self.text_splitter,)
46
  paper.chunker()
47
  paper.save_chunks(include_metadata=True, path=path_author)
48
+ data.append(metadata)
49
  except Exception as e:
50
+ print(f"Error processing paper {i}: {e}")
51
+ return data
52
+