import json
import os
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import parse_qs, urlparse

from datasets import Dataset, load_dataset
from huggingface_hub import HfFolder, login

# Authenticate against the Hugging Face Hub; the token must be set in the environment.
HF_TOKEN = os.environ['HF_TOKEN']
login(HF_TOKEN)
HfFolder.save_token(HF_TOKEN)

# Preprocessed metrics for the tracked libraries. Each DatasetDict holds one
# split per view, e.g. raw counts ('raw') and week-over-week ('wow') aggregates.
datasets = {
    "stars": load_dataset("open-source-metrics/preprocessed_stars"),
    "issues": load_dataset("open-source-metrics/preprocessed_issues"),
    "pip": load_dataset("open-source-metrics/preprocessed_pip").sort('day'),
}

# The same metrics for external libraries, shown for comparison.
external_datasets = {
    "pip": load_dataset("open-source-metrics/pip-external").sort('day'),
    "stars": load_dataset("open-source-metrics/stars-external"),
    "issues": load_dataset("open-source-metrics/issues-external"),
}

# Rename the pip 'openai' split to 'openai_python' so the split names line up
# across the external datasets.
external_datasets['pip']['openai_python'] = external_datasets['pip']['openai']
del external_datasets['pip']['openai']

def cut_output(full_output: Dataset, library_names: list):
    """Keep the requested library columns (plus 'day') and trim trailing None padding.

    A None sentinel is appended to every column so that `.index(None)` is
    guaranteed to succeed; all columns are then cut at the largest first-None
    index across the selected library columns.
    """
    output = full_output.to_dict().items()
    output = {k: v + [None] for k, v in output if k in library_names + ['day']}
    last_value = max(output[k].index(None) for k in output.keys() if k != 'day')
    return {k: v[:last_value] for k, v in output.items()}
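
# For illustration, with a hypothetical 'lib' column: applied to a split whose
# columns are {'day': [d0, d1, d2], 'lib': [1, 2, None]}, cut_output returns
# {'day': [d0, d1], 'lib': [1, 2]} for library_names=['lib'].
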
def parse_name_and_options(path):
    """Extract the library names and option flags from the request's query string.

    Both 'input' and 'options' are required, comma-separated query parameters.
    """
    url = urlparse(path)
    query = parse_qs(url.query)
    library_names = query["input"][0].split(',')
    options = query["options"][0].split(',')
    return library_names, options
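
# For illustration, with placeholder names:
# parse_name_and_options("/retrieveStars?input=a,b&options=1") == (['a', 'b'], ['1'])
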
def sum_of_lists(lists):
    """Element-wise sum of equally long lists, treating None entries as zero."""
    def _sum(items):
        return sum(item for item in items if item is not None)
    return [_sum(a) for a in zip(*lists)]
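
# For illustration: sum_of_lists([[1, 2], [None, 3]]) == [1, 5]
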
class RequestHandler(SimpleHTTPRequestHandler):
    def do_GET(self):
        print(self.path)

        # Serve the dashboard itself at the root.
        if self.path == "/":
            self.path = "index.html"
            return SimpleHTTPRequestHandler.do_GET(self)
if self.path.startswith("/initialize"):
dataset_with_most_splits = max(datasets['stars'].column_names.values(), key=len)
if 'day' in dataset_with_most_splits:
dataset_with_most_splits.remove('day')
external_dataset_keys = {k: set(v.keys()) for k, v in external_datasets.items()}
external_dataset_with_most_splits = max([d for d in external_dataset_keys.values()], key=len)
for external in external_dataset_with_most_splits:
dataset_with_most_splits.remove(external)
warnings = []
print("Initializing ...")
for k, v in external_dataset_keys.items():
if len(v) < len(external_dataset_with_most_splits):
warnings.append(
f"The {k} external dataset does not contain all splits. Missing: {external_dataset_with_most_splits - v}"
f".\nSelecting that split to show the pip install numbers will not work."
)
dataset_with_most_splits = list(dataset_with_most_splits)
dataset_with_most_splits.sort()
external_dataset_with_most_splits = list(external_dataset_with_most_splits)
external_dataset_with_most_splits.sort()
res = {
'internal': dataset_with_most_splits,
'external': external_dataset_with_most_splits,
'warnings': warnings
}
print(f"Returning: {res}")
return self.response(res)
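
        # The three /retrieve* endpoints below share one query format: 'input'
        # lists the libraries to plot and 'options' carries numeric flags whose
        # meaning differs per endpoint.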
if self.path.startswith("/retrievePipInstalls"):
errors = []
library_names, options = parse_name_and_options(self.path)
cumulated = '1' in options
week_over_week = '2' in options
if week_over_week:
if cumulated:
cumulated_dict = {
'Cumulated': sum_of_lists([v for k, v in datasets['pip']['wow'].to_dict().items() if k in library_names]),
'day': datasets['pip']['wow'].to_dict()['day']
}
return self.response(cumulated_dict)
else:
return self.response({k: v for k, v in datasets['pip']['wow'].to_dict().items() if k in library_names + ['day']})
else:
if cumulated:
cumulated_dict = {
'Cumulated': sum_of_lists([v for k, v in datasets['pip']['raw'].to_dict().items() if k in library_names]),
'day': datasets['pip']['raw'].to_dict()['day']
}
return self.response(cumulated_dict)
else:
return self.response({k: v for k, v in datasets['pip']['raw'].to_dict().items() if k in library_names + ['day']})
if self.path.startswith("/retrieveStars"):
library_names, options = parse_name_and_options(self.path)
week_over_week = '1' in options
cumulated = '2' in options
if week_over_week:
if cumulated:
cumulated_dict = {
'Cumulated': sum_of_lists([v for k, v in datasets['stars']['wow'].to_dict().items() if k in library_names]),
'day': datasets['stars']['wow'].to_dict()['day']
}
return self.response(cumulated_dict)
else:
return self.response({k: v for k, v in datasets['stars']['wow'].to_dict().items() if k in library_names + ['day']})
else:
if cumulated:
cumulated_dict = {
'Cumulated': sum_of_lists([v for k, v in datasets['stars']['raw'].to_dict().items() if k in library_names]),
'day': datasets['stars']['raw'].to_dict()['day']
}
return self.response(cumulated_dict)
else:
return self.response({k: v for k, v in datasets['stars']['raw'].to_dict().items() if k in library_names + ['day']})
if self.path.startswith("/retrieveIssues"):
library_names, options = parse_name_and_options(self.path)
exclude_org_members = '1' in options
week_over_week = '2' in options
cumulated = '3' in options
if week_over_week:
if exclude_org_members:
if cumulated:
cumulated_dict = {
'Cumulated': sum_of_lists([v for k, v in datasets['issues']['eom_wow'].to_dict().items() if k in library_names]),
'day': datasets['issues']['eom_wow'].to_dict()['day']
}
return self.response(cumulated_dict)
else:
return self.response(cut_output(datasets['issues']['eom_wow'], library_names))
else:
if cumulated:
cumulated_dict = {
'Cumulated': sum_of_lists([v for k, v in datasets['issues']['wow'].to_dict().items() if k in library_names]),
'day': datasets['issues']['wow'].to_dict()['day']
}
return self.response(cumulated_dict)
else:
return self.response({k: v for k, v in datasets['issues']['wow'].to_dict().items() if k in library_names + ['day']})
else:
if exclude_org_members:
return self.response({k: v for k, v in datasets['issues']['eom'].to_dict().items() if k in library_names + ['day']})
else:
return self.response({k: v for k, v in datasets['issues']['raw'].to_dict().items() if k in library_names + ['day']})
        # Anything else falls through to static file serving.
        return SimpleHTTPRequestHandler.do_GET(self)
    def response(self, output):
        # Send `output` back as a JSON payload.
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps(output).encode("utf-8"))
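
# For illustration (hypothetical library names), the endpoints can be queried as:
#   curl 'http://localhost:7860/initialize'
#   curl 'http://localhost:7860/retrieveStars?input=some_lib,another_lib&options=1'
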
server = ThreadingHTTPServer(("", 7860), RequestHandler)
print("Running on port 7860")
server.serve_forever()