abdullahmeda committed on
Commit
2ac2136
1 Parent(s): 2b4c56a
Coursework_issuu_10.pdf ADDED
Binary file (106 kB). View file
 
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from cw2 import *
3
+
4
# Pre-load every dataset size once at start-up; the req_* callbacks select
# one of these frames by name via globals()[f"data_{size}"], so the
# variable names data_<size> are part of the app's internal interface.
# Each file is newline-delimited JSON (one tracking event per line),
# hence lines=True.
data_tiny = pd.read_json('datasets/sample_tiny.json', lines=True)
data_small = pd.read_json('datasets/sample_small.json', lines=True)
data_100k_lines = pd.read_json('datasets/sample_100k_lines.json', lines=True)
data_400k_lines = pd.read_json('datasets/sample_400k_lines.json', lines=True)

# Derive the continent column up front so the Requirement 2 plots can use
# it without recomputing the country->continent mapping on every request.
for dataset in [data_tiny, data_small, data_100k_lines, data_400k_lines]:
    dataset['visitor_continent'] = dataset['visitor_country'].apply(country_code_to_continent)
11
+
12
+
13
def req_1(doc_uuid, data):
    """Gradio callback for Requirement 2.

    Looks up the module-level frame ``data_<data>`` and rebuilds both the
    views-by-country and views-by-continent histograms for *doc_uuid*.
    Returns the two matplotlib figures in that order.
    """
    frame = globals()[f"data_{data}"]
    country_fig = get_views_by_country(frame, doc_uuid)
    continent_fig = get_views_by_continent(frame, doc_uuid)
    return country_fig, continent_fig
16
+
17
+
18
def change(data):
    """Gradio callback for Requirement 3.

    Refreshes both browser plots (raw useragents and browser families)
    when the dataset-size radio changes.
    """
    frame = globals()[f"data_{data}"]
    return get_visitor_useragents(frame), get_visitor_browsers(frame)
20
+
21
+
22
def req_4(data):
    """Gradio callback for Requirement 4.

    Returns the top-10 avid readers as a table (double reset_index adds
    the '#' rank column expected by the Dataframe component) plus the
    matching horizontal bar chart.
    """
    frame = globals()[f"data_{data}"]
    table = get_avid_readers(frame).reset_index().reset_index()
    return table, plot_avid_readers(frame)
25
+
26
+
27
def req_5(doc, vis, sort, data):
    """Gradio callback for Requirement 5: the "also likes" table.

    *sort* is the radio label ('Ascending'/'Descending'); it is converted
    to the boolean flag also_like expects.
    """
    frame = globals()[f"data_{data}"]
    return also_like(frame, doc, vis, sort == "Ascending")
29
+
30
+
31
# Page layout.  Each requirement gets a markdown header, an input column
# (dataset radio + any UUID textboxes) and an output area, wired to the
# req_* callbacks above.
with gr.Blocks(css="""
    #graph {width: 50%; margin: auto;}
    .mx-auto {width: 70%; margin: auto;}
    #space_between {justify-content: space-between}
    #submit_button {background-color: red;}
""") as demo:
    gr.Markdown("""
    # Data Analysis of a Document Tracker
    This assignment requires us to develop a simple Python-based application, that analyses and displays document tracking data from a major web site.

    The issuu.com platform is a web site for publishing documents. It is widely used by many on-line publishers and currently hosts about 15 million documents. The web site tracks usage of the site and makes the resulting, anonymised data available to a wider audience. For example, it records who views a certain document, the browser used for viewing it, the way how the user arrived at this page etc. In this exercise, we use one of these data sets to perform data processing and analysis in Python.

    The data format uses JSON and is described on this local page, describing the data spec. Note that the data files contain a sequence of entries in JSON format, rather than one huge JSON construct, in order to aid scalability

    This project was built using python 3.10 and following are the implemented functionalities of all the required tasks. Tune the following inputs and click on "Visualize Data" to get the desired output:

    ## Requirement 2: Views by country/continent
    """)
    with gr.Row():
        with gr.Column(scale=1, variant='panel', elem_id="space_between"):
            t2a_doc_uuid = gr.Textbox(value="140228101942-d4c9bd33cc299cc53d584ca1a4bf15d9", label="Enter document UUID:")
            selected_dataset = gr.Radio(label="Choose dataset size:",
                                        choices=['tiny', 'small', '100k_lines', '400k_lines'],
                                        interactive=True,
                                        value='small')
            req_1_submit = gr.Button(value="Generate graphs", elem_id="submit_button")
        with gr.Column(scale=2):
            with gr.Row():
                # Initial plots use the radio/textbox default values.
                views_by_country = gr.Plot(value=get_views_by_country(globals()[f"data_{selected_dataset.value}"], t2a_doc_uuid.value))
                views_by_continent = gr.Plot(value=get_views_by_continent(globals()[f"data_{selected_dataset.value}"], t2a_doc_uuid.value))

    req_1_submit.click(fn=req_1, inputs=[t2a_doc_uuid, selected_dataset], outputs=[views_by_country, views_by_continent])

    gr.Markdown("""
    ## Requirement 3: Views by browser

    We want to identify the most popular browser. To this end, the application has to examine the visitor useragent field and count the number of occurrences for each value in the input file.
    """)
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=1, variant='panel', elem_id="space_between"):
                selected_dataset_3 = gr.Radio(label="Choose dataset size:",
                                              choices=['tiny', 'small', '100k_lines', '400k_lines'],
                                              interactive=True,
                                              value='small')
        with gr.Row():
            all_browsers = gr.Plot(value=get_visitor_useragents(data_tiny))
            main_browsers = gr.Plot(value=get_visitor_browsers(data_tiny))

    selected_dataset_3.change(fn=change, inputs=selected_dataset_3, outputs=[all_browsers, main_browsers])

    gr.Markdown("""
    ## Requirement 4: Reader profiles

    We want to identify the most avid readers. We want to determine, for each user, the total time spent reading documents. The top 10 readers, based on this analysis, are printed below.:
    """)

    with gr.Column():
        with gr.Row():
            with gr.Column(scale=1, variant='panel', elem_id="space_between"):
                selected_dataset_4 = gr.Radio(label="Choose dataset size:",
                                              choices=['tiny', 'small', '100k_lines', '400k_lines'],
                                              interactive=True,
                                              value='small')
        with gr.Row():
            with gr.Column(scale=2):
                # Fixed: a stray trailing comma after the gr.Dataframe(...)
                # call made avid_readers a 1-tuple, so it could not be used
                # as an event output component.
                avid_readers = gr.Dataframe(
                    headers=["#", "Visitor UUID", "Total Page Read Time"],
                    value=get_avid_readers(data_tiny).reset_index().reset_index(),
                    row_count=10,
                    col_count=3,
                )
            with gr.Column(scale=3):
                avid_readers_plot = gr.Plot(value=plot_avid_readers(data_tiny))

    # Fixed: this handler was commented out and referenced selected_dataset_3,
    # leaving the Requirement 4 radio inert; wire it to its own radio.
    selected_dataset_4.change(fn=req_4, inputs=selected_dataset_4, outputs=[avid_readers, avid_readers_plot])

    gr.Markdown("""
    ## Requirement 5: "Also likes" functionality

    Popular document-hosting web sites, such as Amazon, provide information about related documents based on document tracking information. One such feature is the “also likes” functionality: for a given document, identify, which other documents have been read by this document’s readers.
    """)
    with gr.Row():
        with gr.Column(scale=1, variant='panel', elem_id="space_between"):
            selected_dataset_5 = gr.Radio(label="Choose dataset size:",
                                          choices=['tiny', 'small', '100k_lines', '400k_lines'],
                                          interactive=True,
                                          value='small')
            t5_doc_uuid = gr.Textbox(value="100713205147-2ee05a98f1794324952eea5ca678c026", label="Enter document UUID:")
            t5_visitor_uuid = gr.Textbox(value="19c97695f06da354", label="Enter visitor UUID:")
            t5_sorting = gr.Radio(choices=['Ascending', 'Descending'], value='Ascending', label="Specify the sorting function:")
            req_5_submit = gr.Button(value="Submit", elem_id="submit_button")
        with gr.Column(scale=1):
            also_likes_df = gr.Dataframe(value=also_like(globals()[f"data_{selected_dataset_5.value}"], t5_doc_uuid.value, t5_visitor_uuid.value,
                                                         t5_sorting.value == 'Ascending'))

    req_5_submit.click(fn=req_5, inputs=[t5_doc_uuid, t5_visitor_uuid, t5_sorting, selected_dataset_5], outputs=[also_likes_df])

    gr.Markdown("""
    ## Requirement 6: "Also likes" graph

    For the above “also like” functionality, this section generates a graph that displays the relationship between the input document and all documents that have been found as “also like” documents (and only these documents)
    """)
    # Static image produced by cw2.also_like_graph (run via the CLI).
    gr.Image('output.png')

demo.launch()
cw2.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import click
2
+ import pandas as pd
3
+ import graphviz
4
+ import pydot
5
+ import plotly.express as px
6
+ import matplotlib.pyplot as plt
7
+ import pycountry_convert as pc
8
+
9
# Pre-load every dataset size once at import time; run_task selects one
# of these frames by name via globals()[f"d_{f}"], so the variable names
# d_<size> are part of the CLI interface.  Each file is newline-delimited
# JSON (one event per line), hence lines=True.
d_tiny = pd.read_json('datasets/sample_tiny.json', lines=True)
d_small = pd.read_json('datasets/sample_small.json', lines=True)
d_100k_lines = pd.read_json('datasets/sample_100k_lines.json', lines=True)
d_400k_lines = pd.read_json('datasets/sample_400k_lines.json', lines=True)
13
+
14
def country_code_to_continent(country_code):
    """Map an ISO-3166 alpha-2 country code to a continent name.

    The issuu data uses the pseudo-codes "EU" and "AP", which
    pycountry_convert does not know, so they are special-cased up front.
    Any other code it cannot resolve maps to "Unknown country".
    """
    special = {"EU": "European Union", "AP": "Asia/Pacific Region"}
    if country_code in special:
        return special[country_code]
    try:
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except KeyError:
        return "Unknown country"
24
+
25
+
26
def get_views_by_country(data, doc_uuid):
    """Histogram of viewer countries for one document (Req 2a).

    Returns the matplotlib figure so callers (gradio/CLI) can display it.
    """
    country_views = data.loc[data['subject_doc_id'] == doc_uuid, 'visitor_country']
    fig = plt.figure(figsize=(10, 6))
    plt.title('Views by Country')
    plt.xlabel("Countries")
    plt.ylabel("No. of views")
    plt.hist(country_views)
    return fig
33
+
34
+
35
def get_views_by_continent(data, doc_uuid):
    """Histogram of viewer continents for one document (Req 2b).

    Requires the 'visitor_continent' column added at start-up; returns
    the matplotlib figure.
    """
    continent_views = data.loc[data['subject_doc_id'] == doc_uuid, 'visitor_continent']
    fig = plt.figure(figsize=(10, 6))
    plt.title('Views by Continent')
    plt.xlabel("Continents")
    plt.ylabel("No. of views")
    plt.hist(continent_views)
    return fig
42
+
43
+
44
def get_visitor_useragents(data):
    """Bar chart of raw visitor_useragent frequencies (Req 3a)."""
    fig = plt.figure(figsize=(10, 6))
    useragent_counts = data['visitor_useragent'].value_counts()
    useragent_counts.plot(kind='bar')
    return fig
48
+
49
+
50
def get_visitor_browsers(data):
    """Bar chart of browser-family frequencies (Req 3b).

    The browser family is the useragent text up to the first '/'
    (e.g. "Mozilla/5.0 ..." -> "Mozilla").
    """
    fig = plt.figure(figsize=(10, 6))
    # Fixed: previously this assigned the derived column back onto the
    # caller's DataFrame (data['visitor_browser'] = ...), silently
    # mutating the shared module-level datasets on every call.  Keep the
    # derived series local instead.
    browsers = data['visitor_useragent'].str.split('/').str[0]
    browsers.value_counts().plot(kind='bar')
    return fig
55
+
56
+
57
def get_avid_readers(data):
    """Top-10 visitors by total event_readtime, largest first (Req 4)."""
    total_read_time = data.groupby('visitor_uuid')['event_readtime'].sum()
    return total_read_time.sort_values(ascending=False).head(10)
59
+
60
+
61
def plot_avid_readers(data):
    """Horizontal bar chart of the top-10 avid readers (Req 4).

    Re-sorted ascending so the biggest reader ends up at the top of the
    barh plot; returns the matplotlib figure.
    """
    fig = plt.figure(figsize=(11, 7))
    top_readers = get_avid_readers(data).sort_values(ascending=True)
    top_readers.plot(kind='barh')
    return fig
65
+
66
+
67
+ # def get_doc_visitors(doc_uuid):
68
+ # return data_tiny[data_tiny['subject_doc_id'] == doc_uuid]['visitor_uuid'].unique()
69
+ #
70
+ #
71
+ # def get_visitor_docs(visitor_uuid):
72
+ # return data_tiny[data_tiny['visitor_uuid'] == visitor_uuid]['subject_doc_id'].dropna().unique()
73
+ #
74
+ #
75
+ # def get_also_likes_doc(doc_uuid, visitor_uuid, ascending):
76
+ # visitor_uuids = get_doc_visitors(doc_uuid)
77
+ # doc_uuids = sum([list(get_visitor_docs(reader)) for reader in visitor_uuids], [])
78
+ # return pd.Series(doc_uuids).value_counts(ascending=ascending)
79
+ #
80
+ #
81
+ # def also_like(doc_uuid, visitor_uuid, ascending):
82
+ # visitor_uuids = get_doc_visitors(doc_uuid)
83
+ # y = []
84
+ # for reader in visitor_uuids:
85
+ # for doc in get_visitor_docs(reader):
86
+ # if doc != doc_uuid and doc not in get_visitor_docs(visitor_uuid).tolist():
87
+ # y.append([reader, doc])
88
+ # return pd.DataFrame(y)
89
+
90
# Req 5a
def get_doc_visitors(data, doc_uuid):
    """Unique visitor UUIDs that read *doc_uuid* ('reader' events only)."""
    is_read = (data['env_type'] == "reader") & (data['subject_doc_id'] == doc_uuid)
    return data.loc[is_read, 'visitor_uuid'].unique()
94
+
95
+
96
# Req 5b
def get_visitor_docs(data, visitor_uuid):
    """Unique document UUIDs read by *visitor_uuid* ('reader' events only).

    Rows with a missing subject_doc_id are dropped.
    """
    is_read = (data['env_type'] == "reader") & (data['visitor_uuid'] == visitor_uuid)
    return data.loc[is_read, 'subject_doc_id'].dropna().unique()
100
+
101
+
102
# Req 5c, 5d
def also_like(data, doc_uuid, visitor_uuid, ascending, req_5=True):
    """Find documents that readers of *doc_uuid* have also read.

    Documents the requesting *visitor_uuid* has already read, and the
    input document itself, are excluded.

    Parameters:
        data: issuu event frame with 'env_type', 'visitor_uuid' and
            'subject_doc_id' columns.
        doc_uuid: the document whose "also likes" are requested.
        visitor_uuid: the visitor issuing the request.
        ascending: sort order of the document counts (only used when
            req_5 is True).
        req_5: when False, return the raw (reader, document) pairs
            instead of the aggregated table (used by Req 6's graph).

    Returns a DataFrame: either the raw pairs, or the ranked documents
    with their read counts; falls back to the (empty) pairs frame when
    no pairs were found.
    """
    # Fixed: the exclusion set was recomputed by the innermost loop for
    # every (reader, doc) pair; compute it once and use a set for O(1)
    # membership tests.  Also removed leftover debug print() calls.
    own_docs = set(get_visitor_docs(data, visitor_uuid))
    pairs = []
    for reader in get_doc_visitors(data, doc_uuid):
        for doc in get_visitor_docs(data, reader):
            if doc != doc_uuid and doc not in own_docs:
                pairs.append([reader, doc])
    if not req_5:
        return pd.DataFrame(pairs)
    try:
        # pd.DataFrame(pairs) has integer columns 0 (reader) and 1 (doc):
        # group by document, count readers, keep the ten smallest/largest.
        counts = pd.DataFrame(pairs).groupby(1).count()
        ranked = counts.nsmallest(10, [0]) if ascending else counts.nlargest(10, [0])
        # NOTE(review): tail(-1) drops the first row of the ranking —
        # preserved from the original, but looks suspicious; confirm it
        # is intentional and not meant to drop the queried document.
        return ranked.reset_index().tail(-1)
    except KeyError:
        # No pairs found: the empty frame has no column 1 to group by.
        return pd.DataFrame(pairs)
122
+
123
+
124
# Req 6
def also_like_graph(data, doc_uuid, visitor_uuid, ascending):
    """Render the "also likes" relationship graph to output.png (Req 6).

    Builds a Digraph with one edge per (reader, document) pair returned
    by also_like(..., req_5=False); each node is labelled with the last
    four characters of its UUID.  Writes output.dot and output.png and
    returns the PNG path; on any ordinary error, returns the fallback
    placeholder image path "test.jpg".
    """
    try:
        pairs = also_like(data, doc_uuid, visitor_uuid, ascending, req_5=False)
        dot = graphviz.Digraph()
        for _, row in pairs.iterrows():
            # Short 4-char labels keep the rendered graph readable.
            dot.node(str(row[0]), str(row[0])[-4:])
            dot.node(str(row[1]), str(row[1])[-4:])
            dot.edge(str(row[0]), str(row[1]))

        # Fixed: dropped a discarded `.replace('\\', '/')` on the render
        # result and a dead debug block that printed value_counts().
        dot.render('output.dot')

        (graph,) = pydot.graph_from_dot_file('output.dot')
        graph.write_png('output.png')
        return 'output.png'
    except Exception:
        # Fixed: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; keep the best-effort fallback image for
        # ordinary errors only.
        return "test.jpg"
148
+
149
+
150
+ # print(also_like(data_tiny, "100713205147-2ee05a98f1794324952eea5ca678c026", "b5d13a36dad1147b", False).groupby(1).count().nlargest(10, [0]))
151
+ # print(also_like("1s", "a", False).groupby(1).count())
152
+ #
153
@click.command()
@click.option('-u', type=str, help="user_uuid")
@click.option('-d', type=str, help="doc_uuid")
@click.option('-t', type=str, help="task_id")
@click.option('-f', type=str, help="file_name")
def run_task(u, d, t, f):
    """CLI entry point: run task *t* on dataset suffix *f*.

    u: visitor uuid, d: document uuid, t: one of 2a, 2b, 3a, 3b, 4, 5d, 6,
    f: dataset suffix (tiny, small, 100k_lines, 400k_lines).
    An unrecognised task id does nothing.
    """
    def dataset():
        # Resolve the module-level frame d_<f> lazily, so an unknown
        # task id never touches globals() (matches original behaviour).
        return globals()[f"d_{f}"]

    # 2a, 2b, 3a, 3b, 4, 5d, 6, 7
    if t == "2a":
        get_views_by_country(dataset(), d)
        plt.show()
    elif t == "2b":
        get_views_by_continent(dataset(), d)
        plt.show()
    elif t == "3a":
        get_visitor_useragents(dataset())
        plt.show()
    elif t == "3b":
        get_visitor_browsers(dataset())
        plt.show()
    elif t == "4":
        get_avid_readers(dataset())
    elif t == "5d":
        also_like(dataset(), d, u, False)
    elif t == "6":
        also_like_graph(dataset(), d, u, False)


if __name__ == '__main__':
    run_task()
184
+
datasets/sample_small.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/sample_tiny.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/test_small.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"visitor_uuid": "a","subject_doc_id": "1s"}
2
+ {"visitor_uuid": "b","subject_doc_id": "1s"}
3
+ {"visitor_uuid": "b","subject_doc_id": "5s"}
4
+ {"visitor_uuid": "c","subject_doc_id": "1s"}
5
+ {"visitor_uuid": "d","subject_doc_id": "1s"}
6
+ {"visitor_uuid": "c","subject_doc_id": "2s"}
7
+ {"visitor_uuid": "c","subject_doc_id": "3s"}
8
+ {"visitor_uuid": "c","subject_doc_id": "4s"}
9
+ {"visitor_uuid": "c","subject_doc_id": "5s"}
10
+ {"visitor_uuid": "d","subject_doc_id": "5s"}
11
+ {"visitor_uuid": "d","subject_doc_id": "6s"}
12
+ {"visitor_uuid": "d","subject_doc_id": "7s"}
13
+ {"visitor_uuid": "d","subject_doc_id": "2s"}
14
+ {"visitor_uuid": "a","subject_doc_id": "4s"}
15
+ {"visitor_uuid": "e","subject_doc_id": "5s"}
16
+ {"visitor_uuid": "e","subject_doc_id": "8s"}
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ click==8.1.3
2
+ gradio==3.12.0
3
+ graphviz==0.20.1
4
+ matplotlib==3.6.2
5
+ pandas==1.5.1
6
+ plotly==5.11.0
7
+ pycountry_convert==0.7.2
8
+ pydot==1.4.2