# HTTP server powering the open-source-metrics dashboard: loads the stars,
# issues, and pip-install datasets from the Hugging Face Hub, precomputes
# cumulative counters, and serves them as JSON for the front end.

import collections
import json
import os
from datetime import datetime
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import parse_qs, urlparse

import numpy as np
from datasets import DatasetDict, load_dataset
from huggingface_hub import HfFolder, set_access_token

HF_TOKEN = os.environ['HF_TOKEN']
set_access_token(HF_TOKEN)
HfFolder.save_token(HF_TOKEN)

datasets = {
    "stars": load_dataset("open-source-metrics/stars").sort('dates'),
    "issues": load_dataset("open-source-metrics/issues").sort('dates'),
    "pip": load_dataset("open-source-metrics/pip").sort('day'),
}

val = 0


def _range(e):
    # Assign a cumulative index to each row and bucket its date into a week
    # number counted from the epoch. Relies on the global counter `val` being
    # reset to 0 before each split is mapped.
    global val
    e['range'] = val
    val += 1

    current_date = datetime.strptime(e['dates'], "%Y-%m-%dT%H:%M:%SZ")
    first_date = datetime.fromtimestamp(1)
    week = abs(current_date - first_date).days // 7
    e['week'] = week
    return e


def _ignore_org_members(e):
    # Same cumulative index, but only advanced for authors who are not
    # organization members.
    global val
    e['range_non_org'] = val
    if e['type']['authorAssociation'] != 'MEMBER':
        val += 1
    return e


stars = {}
for k, v in datasets['stars'].items():
    stars[k] = v.map(_range)
    val = 0

issues = {}
for k, v in datasets['issues'].items():
    issues[k] = v.map(_range)
    val = 0
    issues[k] = issues[k].map(_ignore_org_members)
    val = 0

datasets['stars'] = DatasetDict(**stars)
datasets['issues'] = DatasetDict(**issues)

# datasets = {
#     k1: DatasetDict({
#         k2: v2.select(range(0, len(v2), max(1, int(len(v2) / 1000)))) for k2, v2 in v1.items()
#     }) for k1, v1 in datasets.items()
# }


def link_values(library_names, returned_values):
    # Forward-fill missing per-library values so that every date has an entry
    # for every requested library.
    previous_values = {library_name: None for library_name in library_names}
    for library_name in library_names:
        for i in returned_values.keys():
            if library_name not in returned_values[i]:
                returned_values[i][library_name] = previous_values[library_name]
            else:
                previous_values[library_name] = returned_values[i][library_name]
    return returned_values


def running_mean(x, N, total_length=-1):
    # Running mean over a window of N samples, left-padded with zeros up to
    # `total_length`. (Currently unused helper.)
    cumsum = np.cumsum(np.insert(x, 0, 0))
    to_pad = max(total_length - len(cumsum), 0)
    return np.pad(cumsum[N:] - cumsum[:-N], (to_pad, 0)) / float(N)


def parse_name_and_options(path):
    # Extract the comma-separated `input` (library names) and `options` query
    # parameters from the request path.
    url = urlparse(path)
    query = parse_qs(url.query)
    library_names = query.get("input", None)[0]
    library_names = library_names.split(',')
    options = query.get("options", None)[0]
    options = options.split(',')
    return library_names, options


class RequestHandler(SimpleHTTPRequestHandler):
    def do_GET(self):
        print(self.path)

        if self.path == "/":
            self.path = "index.html"
            return SimpleHTTPRequestHandler.do_GET(self)

        if self.path.startswith("/initialize"):
            dataset_keys = {k: set(v.keys()) for k, v in datasets.items()}
            dataset_with_most_splits = max(dataset_keys.values(), key=len)

            warnings = []
            for k, v in dataset_keys.items():
                if len(v) < len(dataset_with_most_splits):
                    warnings.append(
                        f"The {k} dataset does not contain all splits. "
                        f"Missing: {dataset_with_most_splits - v}"
                    )

            # TODO: Send and display warnings

            return self.response(sorted(dataset_with_most_splits))

        if self.path.startswith("/retrievePipInstalls"):
            library_names, options = parse_name_and_options(self.path)

            if '1' in options:
                # Option 1: accumulate downloads across all requested libraries.
                returned_values = {}
                for library_name in library_names:
                    for i in datasets['pip'][library_name]:
                        if i['day'] in returned_values:
                            returned_values[i['day']]['Cumulated'] += i['num_downloads']
                        else:
                            returned_values[i['day']] = {'Cumulated': i['num_downloads']}
                library_names = ['Cumulated']
            else:
                returned_values = {}
                for library_name in library_names:
                    for i in datasets['pip'][library_name]:
                        if i['day'] in returned_values:
                            returned_values[i['day']][library_name] = i['num_downloads']
                        else:
                            returned_values[i['day']] = {library_name: i['num_downloads']}

                for library_name in library_names:
                    for i in returned_values.keys():
                        if library_name not in returned_values[i]:
                            returned_values[i][library_name] = None

            returned_values = collections.OrderedDict(sorted(returned_values.items()))
            output = {l: [k[l] for k in returned_values.values()] for l in library_names}
            output['day'] = list(returned_values.keys())
            return self.response(output)

        if self.path.startswith("/retrieveStars"):
            library_names, options = parse_name_and_options(self.path)
            returned_values = {}
            dataset_dict = datasets['stars']
            week_over_week = '1' in options

            for library_name in library_names:
                dataset = dataset_dict[library_name]

                last_value = 0
                last_week = dataset[0]['week']
                for i in dataset:
                    if week_over_week and last_week == i['week']:
                        continue

                    if i['dates'] in returned_values:
                        returned_values[i['dates']][library_name] = i['range'] - last_value
                    else:
                        returned_values[i['dates']] = {library_name: i['range'] - last_value}

                    last_value = i['range'] if week_over_week else 0
                    last_week = i['week']

            returned_values = collections.OrderedDict(sorted(returned_values.items()))
            returned_values = link_values(library_names, returned_values)
            output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
            output['day'] = list(returned_values.keys())[::-1]

            # Trim down to a smaller number of points.
            output = {
                k: [v for i, v in enumerate(value) if i % max(1, int(len(value) / 100)) == 0]
                for k, value in output.items()
            }
            return self.response(output)

        if self.path.startswith("/retrieveIssues"):
            library_names, options = parse_name_and_options(self.path)
            exclude_org_members = '1' in options
            week_over_week = '2' in options

            returned_values = {}
            dataset_dict = datasets['issues']
            range_id = 'range' if not exclude_org_members else 'range_non_org'

            for library_name in library_names:
                dataset = dataset_dict[library_name]

                last_value = 0
                last_week = dataset[0]['week']
                for i in dataset:
                    if week_over_week and last_week == i['week']:
                        continue

                    if i['dates'] in returned_values:
                        returned_values[i['dates']][library_name] = i[range_id] - last_value
                    else:
                        returned_values[i['dates']] = {library_name: i[range_id] - last_value}

                    last_value = i[range_id] if week_over_week else 0
                    last_week = i['week']

            returned_values = collections.OrderedDict(sorted(returned_values.items()))
            returned_values = link_values(library_names, returned_values)
            output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
            output['day'] = list(returned_values.keys())[::-1]

            # Trim down to a smaller number of points.
            output = {
                k: [v for i, v in enumerate(value) if i % max(1, int(len(value) / 100)) == 0]
                for k, value in output.items()
            }
            return self.response(output)

        return SimpleHTTPRequestHandler.do_GET(self)

    def response(self, output):
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps(output).encode("utf-8"))


server = ThreadingHTTPServer(("", 7860), RequestHandler)
print("Running on port 7860")
server.serve_forever()
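# A minimal sketch of how a client might query the endpoints above, assuming
# the server is running locally on port 7860 and the `requests` package is
# available. The library names ("transformers", "datasets") are hypothetical
# examples; any split present in the open-source-metrics datasets works. Kept
# as comments so `serve_forever()` above remains the end of the script.
#
#   import requests
#
#   # List the available library names (the splits shared across datasets).
#   names = requests.get("http://localhost:7860/initialize").json()
#
#   # Weekly star counts ("options=1" enables week-over-week mode).
#   stars = requests.get(
#       "http://localhost:7860/retrieveStars",
#       params={"input": "transformers,datasets", "options": "1"},
#   ).json()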