Sarkosos committed
Commit 14285d3
1 parent: 5144f34

added plots for total jobs done and unique proteins folded

Files changed (4)
  1. api.py +3 -3
  2. app.py +41 -24
  3. classes.py +6 -1
  4. utils.py +5 -9
api.py CHANGED
@@ -8,7 +8,7 @@ import utils
 import pandas as pd
 import uvicorn
 
-from classes import Productivity, ProductivityData, Throughput
+from classes import Productivity, ProductivityData, Last24hProductivityData, Throughput
 
 
 # Global variables (saves time on loading data)
@@ -64,9 +64,9 @@ def productivity_metrics():
     # Unpack the metrics using the correct keys
     result = utils.get_productivity(df_all=data_all, df_24h=data_24h)
     all_time = ProductivityData(**result['all_time'])
-    last_24h = ProductivityData(**result['last_24h'])
+    last_24h = Last24hProductivityData(**result['last_24h'])
 
-    return Productivity(all_time=all_time, last_24h=last_24h)
+    return {"all_time": all_time, "last_24h": last_24h}
 
 
 @app.get("/throughput", response_model=Throughput)
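Note on the changed return value: the handler now hands FastAPI a plain dict instead of constructing Productivity itself. Assuming the route still declares response_model=Productivity (its decorator sits above this hunk and is not shown here), FastAPI validates and serializes the dict against that model, so the response body is unchanged. A minimal sketch under that assumption:

from fastapi import FastAPI
from classes import Productivity, ProductivityData, Last24hProductivityData

app = FastAPI()

# Hypothetical decorator; the real one is outside this hunk.
@app.get('/productivity', response_model=Productivity)
def productivity_metrics():
    # FastAPI coerces the returned dict into Productivity before serializing,
    # so this is equivalent to returning Productivity(all_time=..., last_24h=...).
    return {
        'all_time': ProductivityData(total_completed_jobs={'updated_at': {0: '2024-05-01'}, 'pdb_id': {0: '1abc'}}),
        'last_24h': Last24hProductivityData(unique_folded=1, total_completed_jobs=1),
    }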
app.py CHANGED
@@ -31,39 +31,56 @@ st.subheader('Productivity overview')
 st.info('Productivity metrics show how many proteins have been folded, which is the primary goal of the subnet. Metrics are estimated using weights and biases data combined with heuristics.')
 
 productivity_all = requests.get(f'{BASE_URL}/productivity').json()
-productivity = productivity_all['all_time']
+completed_jobs = productivity_all['all_time']['total_completed_jobs']
+
 productivity_24h = productivity_all['last_24h']
+completed_jobs = pd.DataFrame(completed_jobs)
+completed_jobs['last_event_at'] = pd.to_datetime(completed_jobs['updated_at'])
 
-# st.write(productivity_all)
-# # st.write(productivity)
-# st.write(productivity_24h)
+unique_folded = completed_jobs.drop_duplicates(subset=['pdb_id'], keep='first')
+unique_folded['last_event_at'] = pd.to_datetime(unique_folded['updated_at'])
 
 m1, m2 = st.columns(2)
-
-m1.metric('Unique proteins folded', f'{productivity.get("unique_folded", 0):,.0f}', delta=f'{productivity_24h.get("unique_folded", 0):,.0f} (24h)')
-m2.metric('Total jobs completed', f'{productivity.get("total_completed_jobs", 0):,.0f}', delta=f'{productivity_24h.get("total_completed_jobs", 0):,.0f} (24h)')
-
-# m3.metric('Total simulation steps', f'{productivity.get("total_md_steps"):,.0f}', delta=f'{productivity_24h.get("total_md_steps"):,.0f} (24h)')
-
-# st.markdown('<br>', unsafe_allow_html=True)
-
-# time_binned_data = df.set_index('last_event_at').groupby(pd.Grouper(freq='12h'))
-
-# PROD_CHOICES = {
-#     'Unique proteins folded': 'unique_pdbs',
-#     'Total simulations': 'total_pdbs',
-#     'Total simulation steps': 'total_md_steps',
-# }
-# prod_choice_label = st.radio('Select productivity metric', list(PROD_CHOICES.keys()), index=0, horizontal=True)
-# prod_choice = PROD_CHOICES[prod_choice_label]
-# steps_running_total = time_binned_data[prod_choice].sum().cumsum()
-# st.plotly_chart(
-#     # add fillgradient to make it easier to see the trend
-#     px.area(steps_running_total, y=prod_choice,
-#             labels={'last_event_at':'', prod_choice: prod_choice_label},
-#     ).update_traces(fill='tozeroy'),
-#     use_container_width=True,
-# )
+m1.metric('Unique proteins folded', f'{len(unique_folded):,.0f}', delta=f'{productivity_24h["unique_folded"]:,.0f} (24h)')
+m2.metric('Total jobs completed', f'{len(completed_jobs):,.0f}', delta=f'{productivity_24h["total_completed_jobs"]:,.0f} (24h)')
+
+st.markdown('<br>', unsafe_allow_html=True)
+
+# time_binned_data_complete = completed_jobs.set_index('last_event_at').groupby(pd.Grouper(freq='12h'))
+# time_binned_data_unique = unique_folded.set_index('last_event_at').groupby(pd.Grouper(freq='12h'))
+
+PROD_CHOICES = {
+    'Unique proteins folded': 'unique_pdbs',
+    'Total simulations': 'total_pdbs',
+}
+
+prod_choice_label = st.radio('Select productivity metric', list(PROD_CHOICES.keys()), index=0, horizontal=True)
+prod_choice = PROD_CHOICES[prod_choice_label]
+
+PROD_DATA = {
+    'unique_pdbs': unique_folded,
+    'total_pdbs': completed_jobs,
+}
+df = PROD_DATA[prod_choice]
+
+df = df.sort_values(by='last_event_at').reset_index()
+
+# Create a cumulative count column
+df['cumulative_jobs'] = df.index + 1
+
+# Plot the cumulative jobs over time
+st.plotly_chart(
+    # add fillgradient to make it easier to see the trend
+    px.line(df, x='last_event_at', y='cumulative_jobs',
+            title='Total Jobs Completed Over Time',
+            labels={'last_event_at': 'Time', 'cumulative_jobs': 'Total Jobs Completed'},
+    ).update_traces(fill='tozeroy'),
+    use_container_width=True,
+)
 
 st.markdown('<br>', unsafe_allow_html=True)
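The plotting logic relies on an index trick rather than a groupby for the running total: once the rows are sorted by last_event_at and the index is reset, row position plus one is the number of jobs completed up to that timestamp. A standalone sketch with invented records:

import pandas as pd

# Invented completed-job records, shaped like the API payload.
df = pd.DataFrame({
    'pdb_id': ['1abc', '2xyz', '1abc'],
    'updated_at': ['2024-05-01', '2024-05-02', '2024-05-03'],
})
df['last_event_at'] = pd.to_datetime(df['updated_at'])

# After sorting and resetting the index, position i means "i + 1 jobs
# completed so far" -- the y-values of the cumulative chart.
df = df.sort_values(by='last_event_at').reset_index(drop=True)
df['cumulative_jobs'] = df.index + 1
print(df[['last_event_at', 'cumulative_jobs']])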
 
classes.py CHANGED
@@ -1,12 +1,17 @@
 from pydantic import BaseModel
 
 class ProductivityData(BaseModel):
+    total_completed_jobs: dict[str, dict[int, str]]
+
+
+
+class Last24hProductivityData(BaseModel):
     unique_folded: int
     total_completed_jobs: int
 
 class Productivity(BaseModel):
     all_time: ProductivityData
-    last_24h: ProductivityData
+    last_24h: Last24hProductivityData
 
 class ThroughputData(BaseModel):
     validator_sent: float
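The new dict[str, dict[int, str]] annotation on total_completed_jobs matches pandas' default DataFrame.to_dict() orientation used in utils.py below: column name mapped to {row index: value}. One caveat, assuming nothing about upstream dtypes: the inner values only validate as str if updated_at reaches this model as strings; raw pandas Timestamp objects would not. The expected shape, with made-up rows:

import pandas as pd

# to_dict() defaults to orient='dict', i.e. {column -> {row index -> value}}.
completed_jobs = pd.DataFrame({
    'updated_at': ['2024-05-01T12:00:00', '2024-05-02T08:30:00'],
    'pdb_id': ['1abc', '2xyz'],
})
payload = completed_jobs[['updated_at', 'pdb_id']].to_dict()
# payload == {
#     'updated_at': {0: '2024-05-01T12:00:00', 1: '2024-05-02T08:30:00'},
#     'pdb_id': {0: '1abc', 1: '2xyz'},
# }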
utils.py CHANGED
@@ -164,7 +164,6 @@ def get_data_transferred(df, unit='GB'):
 def get_productivity(df_all, df_24h):
     result = {
         'all_time': {
-            'unique_folded': 0,
             'total_completed_jobs': 0
         },
         'last_24h': {
@@ -173,19 +172,16 @@ def get_productivity(df_all, df_24h):
         }
     }
     if df_all is not None:
-        unique_folded_all = len(df_all.pdb_id.value_counts())
-        completed_jobs_all = len(df_all[df_all.active == False])
-
-        total_historical_run_updates = df_all.active.isna().sum()
-        total_historical_completed_jobs = total_historical_run_updates//10 # this is an estimate based on the minimum number of updates per pdb
+
+
+        completed_jobs = df_all[df_all['updated_count'] == 10]
 
         result['all_time'].update({
-            'unique_folded': unique_folded_all,
-            'total_completed_jobs': (completed_jobs_all + total_historical_completed_jobs).item(),
+            'total_completed_jobs': completed_jobs[["updated_at", "pdb_id"]].to_dict(),
         })
 
     if df_24h is not None:
-        completed_jobs_24h = df_24h[df_24h['updated_count'] >= 10]
+        completed_jobs_24h = df_24h[df_24h['updated_count'] == 10]
         unique_completed_jobs_24h = completed_jobs_24h.drop_duplicates(subset=['pdb_id'], keep='first')
         result['last_24h'].update({
             'unique_folded': len(unique_completed_jobs_24h),
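The completion heuristic is now uniform: a job counts as complete when updated_count == 10, i.e. exactly ten run updates, replacing both the active-flag estimate in the all-time branch and the >= 10 filter in the 24h branch. Worth flagging: == 10 silently drops any job that accumulates more than ten updates, if the data allows that. A small illustration with invented rows:

import pandas as pd

# Invented run-update rows illustrating the completion heuristic.
df = pd.DataFrame({
    'pdb_id': ['1abc', '2xyz', '3def', '1abc'],
    'updated_count': [10, 7, 10, 10],
    'updated_at': ['2024-05-01', '2024-05-02', '2024-05-03', '2024-05-04'],
})

completed = df[df['updated_count'] == 10]  # 3 rows pass the filter
unique_folded = completed.drop_duplicates(subset=['pdb_id'], keep='first')  # 2 unique pdb_ids
print(len(completed), len(unique_folded))  # -> 3 2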