|
import collections |
|
import os |
|
from datetime import datetime, timedelta |
|
import json |
|
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer |
|
from urllib.parse import parse_qs, urlparse |
|
|
|
from huggingface_hub import list_datasets, set_access_token, HfFolder |
|
from datasets import load_dataset, DatasetDict, Dataset |
|
import numpy as np |
|
|
|
# Hub authentication. Fails fast with a KeyError at startup if HF_TOKEN is
# not set in the environment.
HF_TOKEN = os.environ['HF_TOKEN']

# NOTE(review): `set_access_token` was removed in recent huggingface_hub
# releases — confirm the pinned version still exports it.
set_access_token(HF_TOKEN)

# Persist the token to the local HF config folder so `datasets.load_dataset`
# below can read private repos.
HfFolder.save_token(HF_TOKEN)
|
|
|
|
|
# Metric datasets pulled from the Hub at startup, sorted chronologically so
# the cumulative counters computed by `_range` below increase over time.
# 'stars' and 'issues' are keyed by an ISO-8601 'dates' column; 'pip' by a
# 'day' column. Each value is a DatasetDict with one split per library.
datasets = {
    "stars": load_dataset("open-source-metrics/stars").sort('dates'),
    "issues": load_dataset("open-source-metrics/issues").sort('dates'),
    "pip": load_dataset("open-source-metrics/pip").sort('day')
}
|
|
|
val = 0 |
|
|
|
|
|
def _range(e): |
|
global val |
|
e['range'] = val |
|
val += 1 |
|
|
|
current_date = datetime.strptime(e['dates'], "%Y-%m-%dT%H:%M:%SZ") |
|
first_date = datetime.fromtimestamp(1) |
|
week = abs(current_date - first_date).days // 7 |
|
e['week'] = week |
|
|
|
return e |
|
|
|
|
|
def _ignore_org_members(e): |
|
global val |
|
e['range_non_org'] = val |
|
|
|
if e['type']['authorAssociation'] != 'MEMBER': |
|
val += 1 |
|
|
|
return e |
|
|
|
# Annotate every split of the stars dataset with cumulative indices.
# `val` is the shared counter used by `_range`; it must be reset to 0 after
# each split so indices restart per split.
stars = {}
for k, v in datasets['stars'].items():
    stars[k] = v.map(_range)
    val = 0

# Same for issues, plus a second pass computing the cumulative index that
# excludes org-member activity ('range_non_org').
issues = {}
for k, v in datasets['issues'].items():
    issues[k] = v.map(_range)
    val = 0
    issues[k] = issues[k].map(_ignore_org_members)
    val = 0

# Replace the raw datasets with the annotated versions.
datasets['stars'] = DatasetDict(**stars)
datasets['issues'] = DatasetDict(**issues)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def link_values(library_names, returned_values):
    """Forward-fill missing per-library datapoints, in place.

    Walks ``returned_values`` in its key order and, for each library name,
    fills entries where that library is absent with the most recently seen
    value (``None`` until the library's first datapoint appears). The
    caller is expected to have sorted the mapping chronologically first.

    Returns the same mapping for convenience.
    """
    for name in library_names:
        last_seen = None
        for entry in returned_values.values():
            if name in entry:
                last_seen = entry[name]
            else:
                entry[name] = last_seen

    return returned_values
|
|
|
|
|
def running_mean(x, N, total_length=-1):
    """Simple moving average of *x* over a window of ``N`` samples.

    Returns ``len(x) - N + 1`` points; when ``total_length`` exceeds
    ``len(x) + 1`` the front is zero-padded by the difference.

    NOTE(review): the pad amount is computed against the prefix-sum length
    (``len(x) + 1``), not the output length — the padded result has
    ``total_length - N`` points, not ``total_length``. Confirm callers
    expect this offset.
    """
    prefix = np.cumsum(np.insert(x, 0, 0))
    window_sums = prefix[N:] - prefix[:-N]
    pad_front = max(total_length - len(prefix), 0)
    return np.pad(window_sums, (pad_front, 0)) / float(N)
|
|
|
|
|
def parse_name_and_options(path):
    """Extract library names and option flags from a request path.

    Expects a query string of the form ``?input=a,b,c&options=1,2``.

    Returns:
        A ``(library_names, options)`` tuple of lists of strings, each the
        comma-split parameter value. Either list is ``['']`` when the
        corresponding parameter is missing — previously a missing parameter
        crashed the handler thread with ``TypeError`` because
        ``query.get("input", None)[0]`` subscripted ``None``.
    """
    query = parse_qs(urlparse(path).query)

    # parse_qs maps each present key to a non-empty list of values; default
    # to a single empty string so the subscript and split are always safe.
    library_names = query.get("input", [""])[0].split(',')
    options = query.get("options", [""])[0].split(',')

    return library_names, options
|
|
|
|
|
class RequestHandler(SimpleHTTPRequestHandler):
    """Serves the static front-end plus a small JSON API over the metric datasets.

    API endpoints (all GET):
      * /initialize          — list of splits in the largest dataset.
      * /retrievePipInstalls — daily pip downloads per library.
      * /retrieveStars       — cumulative (or week-over-week) star counts.
      * /retrieveIssues      — cumulative (or week-over-week) issue counts.
    Any other path falls through to static file serving.
    """

    def do_GET(self):
        """Route API paths to their JSON endpoints; serve files otherwise."""
        print(self.path)
        if self.path == "/":
            self.path = "index.html"

            return SimpleHTTPRequestHandler.do_GET(self)

        if self.path.startswith("/initialize"):
            return self._initialize()

        if self.path.startswith("/retrievePipInstalls"):
            return self._retrieve_pip_installs()

        if self.path.startswith("/retrieveStars"):
            library_names, options = parse_name_and_options(self.path)
            # Option '1' switches to week-over-week deltas.
            output = self._accumulate_range_values(
                library_names, datasets['stars'], 'range',
                week_over_week='1' in options,
            )
            return self.response(output)

        if self.path.startswith("/retrieveIssues"):
            library_names, options = parse_name_and_options(self.path)
            # Option '1' excludes issues authored by org members; option '2'
            # switches to week-over-week deltas.
            exclude_org_members = '1' in options
            range_id = 'range_non_org' if exclude_org_members else 'range'
            output = self._accumulate_range_values(
                library_names, datasets['issues'], range_id,
                week_over_week='2' in options,
            )
            return self.response(output)

        # Anything else is treated as a static asset.
        return SimpleHTTPRequestHandler.do_GET(self)

    def _initialize(self):
        """Respond with the sorted splits of the dataset with the most splits."""
        dataset_keys = {k: set(v.keys()) for k, v in datasets.items()}
        dataset_with_most_splits = max(dataset_keys.values(), key=len)
        warnings = []

        for k, v in dataset_keys.items():
            if len(v) < len(dataset_with_most_splits):
                # BUGFIX: was `warnings.extend(f"...")`, which extends the
                # list with the message's individual *characters*; append
                # keeps the message intact.
                warnings.append(f"The {k} dataset does not contain all splits. Missing: {dataset_with_most_splits - v}")

        return self.response(sorted(dataset_with_most_splits))

    def _retrieve_pip_installs(self):
        """Respond with daily pip download counts per requested library."""
        library_names, options = parse_name_and_options(self.path)
        returned_values = {}

        if '1' in options:
            # Option '1': sum downloads across all requested libraries into
            # a single 'Cumulated' series.
            for library_name in library_names:
                for row in datasets['pip'][library_name]:
                    day = row['day']
                    if day in returned_values:
                        returned_values[day]['Cumulated'] += row['num_downloads']
                    else:
                        returned_values[day] = {'Cumulated': row['num_downloads']}

            library_names = ['Cumulated']

        else:
            # One series per library, keyed by day.
            for library_name in library_names:
                for row in datasets['pip'][library_name]:
                    day = row['day']
                    if day in returned_values:
                        returned_values[day][library_name] = row['num_downloads']
                    else:
                        returned_values[day] = {library_name: row['num_downloads']}

        # Days where a library has no datapoint get an explicit None so all
        # series have equal length.
        for library_name in library_names:
            for day_values in returned_values.values():
                day_values.setdefault(library_name, None)

        returned_values = collections.OrderedDict(sorted(returned_values.items()))
        output = {name: [day[name] for day in returned_values.values()] for name in library_names}
        output['day'] = list(returned_values.keys())

        return self.response(output)

    def _accumulate_range_values(self, library_names, dataset_dict, range_id, week_over_week):
        """Build the chart payload shared by the stars and issues endpoints.

        Walks each library's split, taking either the raw cumulative counter
        (``range_id`` column) or, when *week_over_week* is set, the delta
        against the previous week's value (skipping intra-week rows). The
        result is sorted by date, forward-filled across libraries, reversed,
        and downsampled to roughly 100 points per series.
        """
        returned_values = {}

        for library_name in library_names:
            dataset = dataset_dict[library_name]

            last_value = 0
            last_week = dataset[0]['week']
            for row in dataset:
                # In week-over-week mode only the first row of each week
                # contributes a datapoint.
                if week_over_week and last_week == row['week']:
                    continue

                if row['dates'] in returned_values:
                    returned_values[row['dates']][library_name] = row[range_id] - last_value
                else:
                    returned_values[row['dates']] = {library_name: row[range_id] - last_value}

                last_value = row[range_id] if week_over_week else 0
                last_week = row['week']

        returned_values = collections.OrderedDict(sorted(returned_values.items()))
        returned_values = link_values(library_names, returned_values)
        output = {name: [day[name] for day in returned_values.values()][::-1] for name in library_names}
        output['day'] = list(returned_values.keys())[::-1]

        # Downsample each series to roughly 100 evenly spaced points.
        output = {k: [v for i, v in enumerate(value) if i % max(1, int(len(value) / 100)) == 0] for k, value in output.items()}

        return output

    def response(self, output):
        """Send *output* as a JSON 200 response."""
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()

        self.wfile.write(json.dumps(output).encode("utf-8"))
        # Previously returned the SimpleHTTPRequestHandler *class* by
        # accident; do_GET's return value is ignored by the server, so this
        # now returns None.
|
|
|
|
|
# Bind to all interfaces on port 7860 (the port Hugging Face Spaces expects);
# ThreadingHTTPServer serves each request on its own thread.
server = ThreadingHTTPServer(("", 7860), RequestHandler)

print("Running on port 7860")

# Blocks forever; the process is expected to be terminated externally.
server.serve_forever()
|
|