bluuebunny committed on
Commit
14a2693
·
verified ·
1 Parent(s): d6e883c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -0
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import boto3
2
+ import os
3
+ import zipfile
4
+ from glob import glob
5
+ import shutil
6
+ from huggingface_hub import HfApi
7
+ import gradio as gr
8
+ from tqdm.auto import tqdm
9
+ import threading
10
+
11
+
12
+ ################################################################################
13
+
14
# Declarations: module-level settings read by the rest of the script.
print("Declaring variables.")

# Name of the AWS service to connect to
service_name = "s3"

# Names of the AWS S3 source buckets (bioRxiv and medRxiv)
biorxiv_bucket_name, medrxiv_bucket_name = (
    "biorxiv-src-monthly",
    "medrxiv-src-monthly",
)

# AWS region used when creating the S3 client
region_name = "us-east-1"

# Name of the Hugging Face dataset repository that receives the uploads
destination_repo_name = "xml-dump-monthly"
28
+
29
+ ################################################################################
30
+
31
print("Initiating clients.")

# Create an S3 client. Credentials are read from the environment; the service
# and region come from the constants declared above so they are defined in
# exactly one place.
s3_client = boto3.client(
    service_name=service_name,
    region_name=region_name,
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY')
)

# Paginator used to list bucket contents page by page
paginator = s3_client.get_paginator('list_objects_v2')

# Create a Hugging Face API client; the token is read from the environment
access_token = os.getenv('HF_API_KEY')
hugging_face_api = HfApi(token=access_token)

# Create the public dataset repo; exist_ok=True makes this a no-op when the
# repo is already there
hugging_face_api.create_repo(
    repo_id=destination_repo_name,
    repo_type="dataset",
    private=False,
    exist_ok=True
)

# Extract the Hugging Face username and build the fully qualified repo id
username = hugging_face_api.whoami()['name']
repo_id = f"{username}/{destination_repo_name}"
57
+
58
+ ################################################################################
59
+
60
def download_biorxiv(Prefix=""):
    """Collect the XML of every ``.meca`` archive under an S3 prefix and upload it.

    Lists the objects in ``biorxiv_bucket_name`` whose key starts with
    ``Prefix``, downloads each ``.meca`` archive (handled as a zip file),
    copies the first ``content/*.xml`` file it contains into the folder
    ``<Prefix>biorxiv-xml-dump``, zips that folder, and uploads the zip to the
    Hugging Face dataset repo ``repo_id``.

    Uses the module-level ``paginator``, ``s3_client``, ``hugging_face_api``
    and ``repo_id``. Scratch data is written to ``tmp_bio.meca`` and
    ``tmp_bio/`` in the current working directory, so two concurrent calls
    would interfere with each other.

    Args:
        Prefix: S3 key prefix to list. It is also prepended to the local
            output folder name and therefore to the uploaded file's path.

    Returns:
        None. Failures on individual archives are printed and skipped.
    """
    print("Downloading Biorxiv files.")

    # Output folder for the extracted XML files
    biorxiv_output_folder = Prefix + 'biorxiv-xml-dump'

    # Create the output folder if it doesn't exist
    os.makedirs(biorxiv_output_folder, exist_ok=True)

    # Gather all objects from the Biorxiv bucket
    biorxiv_pages = paginator.paginate(
        Bucket=biorxiv_bucket_name,
        RequestPayer='requester',
        Prefix=Prefix
    ).build_full_result()

    # The 'Contents' key can be missing when no object matches the prefix;
    # indexing it directly would raise KeyError and kill the worker thread.
    biorxiv_objects = biorxiv_pages.get('Contents', [])
    if not biorxiv_objects:
        print(f"No objects found under prefix {Prefix!r}; nothing to upload.")
        return

    # Download all objects from the Biorxiv bucket
    for biorxiv_object in tqdm(biorxiv_objects, desc=Prefix):

        key = biorxiv_object['Key']

        # Only .meca archives are processed
        if not key.endswith(".meca"):
            continue

        try:
            # Download the archive
            s3_client.download_file(
                biorxiv_bucket_name,
                key,
                'tmp_bio.meca',
                ExtraArgs={'RequestPayer': 'requester'},
            )

            # Unzip the meca file
            with zipfile.ZipFile('tmp_bio.meca', 'r') as zip_ref:
                zip_ref.extractall("tmp_bio")

            # Gather the xml file
            xml = glob('tmp_bio/content/*.xml')
            if not xml:
                print(f"Error processing file {key}: no XML file found in content/")
                continue

            # Copy the xml file to the output folder
            shutil.copy(xml[0], biorxiv_output_folder)

        except Exception as e:
            # Best effort: report the failure and move on to the next archive
            print(f"Error processing file {key}: {e}")

        finally:
            # Always clear the scratch data. If a failed archive left tmp_bio/
            # behind, the next archive would be extracted on top of it and the
            # glob above could pick up the previous article's XML.
            shutil.rmtree('tmp_bio', ignore_errors=True)
            if os.path.exists('tmp_bio.meca'):
                os.remove('tmp_bio.meca')

    # Zip the output folder
    shutil.make_archive(biorxiv_output_folder, 'zip', biorxiv_output_folder)

    # Upload the zip file to Hugging Face
    print(f"Uploading {biorxiv_output_folder}.zip to Hugging Face repo {repo_id}.")
    hugging_face_api.upload_file(
        path_or_fileobj=f'{biorxiv_output_folder}.zip',
        path_in_repo=f'{biorxiv_output_folder}.zip',
        repo_id=repo_id,
        repo_type="dataset",
    )

    print("Biorxiv Done.")
119
+
120
+
121
+
122
# Run the bioRxiv download for the October 2024 prefix in its own thread so
# the statements that follow are not blocked while it works.
first_thread2 = threading.Thread(
    target=download_biorxiv,
    kwargs={"Prefix": "Current_Content/October_2024/"},
)
first_thread2.start()
127
+
128
+
129
+ ###############################################################################
130
+
131
+ # Dummy app
132
+
133
def greet(name, intensity):
    """Return ``"Hello, <name>"`` followed by ``int(intensity)`` exclamation marks.

    Args:
        name: Text to greet; must be a ``str``.
        intensity: Number of exclamation marks; it is truncated with ``int()``,
            and values of zero or less add none.
    """
    greeting = "Hello, " + name
    exclamations = "!" * int(intensity)
    return greeting + exclamations
135
+
136
# Placeholder Gradio UI: a text box and a slider feed greet(), whose result is
# shown as text.
demo = gr.Interface(greet, ["text", "slider"], ["text"])

demo.launch()
143
+