File size: 1,983 Bytes
55ee8cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb6da00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import requests

def clean_up_tags(tags_list):
    """
    Strip the prefix from "prefix:value" style tags and join all tags into one string.

    Args:
    tags_list (list): Tag strings, each optionally containing ':'

    Returns:
    str: The cleaned tags joined with ", " (an empty string for an empty list)
    """
    # For a tag containing ':', keep only the segment between the first and
    # second colon; tags without a colon pass through untouched.
    stripped = [
        entry.split(':')[1] if ':' in entry else entry
        for entry in tags_list
    ]
    return ", ".join(stripped)



def check_api_url(url):
    """
    This function checks to see if "api" is present in the URL between ".co" and "/datasets". If not, it inserts "/api" directly after ".co".

    Only the first ".co" and the first "/datasets" after it are considered, so
    later occurrences of either marker (e.g. inside a dataset name) are left
    intact. A URL that lacks either marker is returned unchanged.

    Args:
    url (str): A URL string

    Returns:
    str: A URL string with "/api" inserted if necessary
    """
    # partition() splits on the first occurrence only and never raises when
    # the separator is missing (the separator slot is simply empty).
    host_part, co_marker, remainder = url.partition(".co")
    if not co_marker:
        return url

    between, datasets_marker, tail = remainder.partition("/datasets")
    if not datasets_marker or "/api" in between:
        # Either there is nothing to rewrite or the URL already targets the API.
        return url

    return host_part + co_marker + "/api" + between + datasets_marker + tail



def get_dataset_metadata(dataset_url):
    """
    Fetch a dataset's metadata and keep only the 'id', 'description' and 'tags' fields.

    Args:
    dataset_url (str): A dataset URL; it is passed through check_api_url before the request is made

    Returns:
    dict: Those of the keys 'id', 'description' and 'tags' that are present in the JSON response, or an empty dict when the response status is not 200
    """
    keys_to_retrieve = ['id', 'description', 'tags']
    api_url = check_api_url(dataset_url)
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(api_url, timeout=10)
    if response.status_code != 200:
        return {}

    response_json = response.json()
    return {key: response_json[key] for key in keys_to_retrieve if key in response_json}


def get_dataset_readme(dataset_url):
    """
    Fetch a dataset's README.md text together with the dataset id.

    The README is requested from dataset_url + '/raw/main/README.md'; the id is
    read from the JSON returned by the URL that check_api_url produces.

    Args:
    dataset_url (str): A dataset URL (a trailing "/" is tolerated)

    Returns:
    dict: {'id': ..., 'README': ...} when both requests return status 200 and the metadata contains an 'id'; otherwise an empty dict
    """
    metadata_url = check_api_url(dataset_url)
    # Drop a trailing slash so the README path never contains "//".
    readme_url = dataset_url.rstrip('/') + '/raw/main/README.md'

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    readme_response = requests.get(readme_url, timeout=10)
    if readme_response.status_code != 200:
        # No README available, so skip the metadata request entirely.
        return {}

    metadata_response = requests.get(metadata_url, timeout=10)
    if metadata_response.status_code != 200:
        # An error response may not be JSON or may lack 'id'; report
        # "nothing retrieved" the same way as a missing README.
        return {}

    dataset_id = metadata_response.json().get('id')
    if dataset_id is None:
        return {}

    return {'id': dataset_id, 'README': readme_response.text}