abdullahmeda committed on
Commit
2ac2136
1 Parent(s): 2b4c56a
Coursework_issuu_10.pdf ADDED
Binary file (106 kB). View file
 
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from cw2 import *
3
+
4
# Pre-load every dataset size once at start-up; the req_* callbacks select
# one of these frames by name via globals()[f"data_{size}"], so the
# variable names data_<size> are part of the app's internal interface.
# Each file is newline-delimited JSON (one tracking event per line),
# hence lines=True.
data_tiny = pd.read_json('datasets/sample_tiny.json', lines=True)
data_small = pd.read_json('datasets/sample_small.json', lines=True)
data_100k_lines = pd.read_json('datasets/sample_100k_lines.json', lines=True)
data_400k_lines = pd.read_json('datasets/sample_400k_lines.json', lines=True)

# Derive the continent column up front so the Requirement 2 plots can use
# it without recomputing the country->continent mapping on every request.
for dataset in [data_tiny, data_small, data_100k_lines, data_400k_lines]:
    dataset['visitor_continent'] = dataset['visitor_country'].apply(country_code_to_continent)
11
+
12
+
13
def req_1(doc_uuid, data):
    """Gradio callback for Requirement 2.

    Looks up the module-level frame ``data_<data>`` and rebuilds both the
    views-by-country and views-by-continent histograms for *doc_uuid*.
    Returns the two matplotlib figures in that order.
    """
    frame = globals()[f"data_{data}"]
    country_fig = get_views_by_country(frame, doc_uuid)
    continent_fig = get_views_by_continent(frame, doc_uuid)
    return country_fig, continent_fig
16
+
17
+
18
def change(data):
    """Gradio callback for Requirement 3.

    Refreshes both browser plots (raw useragents and browser families)
    when the dataset-size radio changes.
    """
    frame = globals()[f"data_{data}"]
    return get_visitor_useragents(frame), get_visitor_browsers(frame)
20
+
21
+
22
def req_4(data):
    """Gradio callback for Requirement 4.

    Returns the top-10 avid readers as a table (double reset_index adds
    the '#' rank column expected by the Dataframe component) plus the
    matching horizontal bar chart.
    """
    frame = globals()[f"data_{data}"]
    table = get_avid_readers(frame).reset_index().reset_index()
    return table, plot_avid_readers(frame)
25
+
26
+
27
def req_5(doc, vis, sort, data):
    """Gradio callback for Requirement 5: the "also likes" table.

    *sort* is the radio label ('Ascending'/'Descending'); it is converted
    to the boolean flag also_like expects.
    """
    frame = globals()[f"data_{data}"]
    return also_like(frame, doc, vis, sort == "Ascending")
29
+
30
+
31
# Page layout.  Each requirement gets a markdown header, an input column
# (dataset radio + any UUID textboxes) and an output area, wired to the
# req_* callbacks above.
with gr.Blocks(css="""
    #graph {width: 50%; margin: auto;}
    .mx-auto {width: 70%; margin: auto;}
    #space_between {justify-content: space-between}
    #submit_button {background-color: red;}
""") as demo:
    gr.Markdown("""
    # Data Analysis of a Document Tracker
    This assignment requires us to develop a simple Python-based application, that analyses and displays document tracking data from a major web site.

    The issuu.com platform is a web site for publishing documents. It is widely used by many on-line publishers and currently hosts about 15 million documents. The web site tracks usage of the site and makes the resulting, anonymised data available to a wider audience. For example, it records who views a certain document, the browser used for viewing it, the way how the user arrived at this page etc. In this exercise, we use one of these data sets to perform data processing and analysis in Python.

    The data format uses JSON and is described on this local page, describing the data spec. Note that the data files contain a sequence of entries in JSON format, rather than one huge JSON construct, in order to aid scalability

    This project was built using python 3.10 and following are the implemented functionalities of all the required tasks. Tune the following inputs and click on "Visualize Data" to get the desired output:

    ## Requirement 2: Views by country/continent
    """)
    with gr.Row():
        with gr.Column(scale=1, variant='panel', elem_id="space_between"):
            t2a_doc_uuid = gr.Textbox(value="140228101942-d4c9bd33cc299cc53d584ca1a4bf15d9", label="Enter document UUID:")
            selected_dataset = gr.Radio(label="Choose dataset size:",
                                        choices=['tiny', 'small', '100k_lines', '400k_lines'],
                                        interactive=True,
                                        value='small')
            req_1_submit = gr.Button(value="Generate graphs", elem_id="submit_button")
        with gr.Column(scale=2):
            with gr.Row():
                # Initial plots use the radio/textbox default values.
                views_by_country = gr.Plot(value=get_views_by_country(globals()[f"data_{selected_dataset.value}"], t2a_doc_uuid.value))
                views_by_continent = gr.Plot(value=get_views_by_continent(globals()[f"data_{selected_dataset.value}"], t2a_doc_uuid.value))

    req_1_submit.click(fn=req_1, inputs=[t2a_doc_uuid, selected_dataset], outputs=[views_by_country, views_by_continent])

    gr.Markdown("""
    ## Requirement 3: Views by browser

    We want to identify the most popular browser. To this end, the application has to examine the visitor useragent field and count the number of occurrences for each value in the input file.
    """)
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=1, variant='panel', elem_id="space_between"):
                selected_dataset_3 = gr.Radio(label="Choose dataset size:",
                                              choices=['tiny', 'small', '100k_lines', '400k_lines'],
                                              interactive=True,
                                              value='small')
        with gr.Row():
            all_browsers = gr.Plot(value=get_visitor_useragents(data_tiny))
            main_browsers = gr.Plot(value=get_visitor_browsers(data_tiny))

    selected_dataset_3.change(fn=change, inputs=selected_dataset_3, outputs=[all_browsers, main_browsers])

    gr.Markdown("""
    ## Requirement 4: Reader profiles

    We want to identify the most avid readers. We want to determine, for each user, the total time spent reading documents. The top 10 readers, based on this analysis, are printed below.:
    """)

    with gr.Column():
        with gr.Row():
            with gr.Column(scale=1, variant='panel', elem_id="space_between"):
                selected_dataset_4 = gr.Radio(label="Choose dataset size:",
                                              choices=['tiny', 'small', '100k_lines', '400k_lines'],
                                              interactive=True,
                                              value='small')
        with gr.Row():
            with gr.Column(scale=2):
                # Fixed: a stray trailing comma after the gr.Dataframe(...)
                # call made avid_readers a 1-tuple, so it could not be used
                # as an event output component.
                avid_readers = gr.Dataframe(
                    headers=["#", "Visitor UUID", "Total Page Read Time"],
                    value=get_avid_readers(data_tiny).reset_index().reset_index(),
                    row_count=10,
                    col_count=3,
                )
            with gr.Column(scale=3):
                avid_readers_plot = gr.Plot(value=plot_avid_readers(data_tiny))

    # Fixed: this handler was commented out and referenced selected_dataset_3,
    # leaving the Requirement 4 radio inert; wire it to its own radio.
    selected_dataset_4.change(fn=req_4, inputs=selected_dataset_4, outputs=[avid_readers, avid_readers_plot])

    gr.Markdown("""
    ## Requirement 5: "Also likes" functionality

    Popular document-hosting web sites, such as Amazon, provide information about related documents based on document tracking information. One such feature is the “also likes” functionality: for a given document, identify, which other documents have been read by this document’s readers.
    """)
    with gr.Row():
        with gr.Column(scale=1, variant='panel', elem_id="space_between"):
            selected_dataset_5 = gr.Radio(label="Choose dataset size:",
                                          choices=['tiny', 'small', '100k_lines', '400k_lines'],
                                          interactive=True,
                                          value='small')
            t5_doc_uuid = gr.Textbox(value="100713205147-2ee05a98f1794324952eea5ca678c026", label="Enter document UUID:")
            t5_visitor_uuid = gr.Textbox(value="19c97695f06da354", label="Enter visitor UUID:")
            t5_sorting = gr.Radio(choices=['Ascending', 'Descending'], value='Ascending', label="Specify the sorting function:")
            req_5_submit = gr.Button(value="Submit", elem_id="submit_button")
        with gr.Column(scale=1):
            also_likes_df = gr.Dataframe(value=also_like(globals()[f"data_{selected_dataset_5.value}"], t5_doc_uuid.value, t5_visitor_uuid.value,
                                                         t5_sorting.value == 'Ascending'))

    req_5_submit.click(fn=req_5, inputs=[t5_doc_uuid, t5_visitor_uuid, t5_sorting, selected_dataset_5], outputs=[also_likes_df])

    gr.Markdown("""
    ## Requirement 6: "Also likes" graph

    For the above “also like” functionality, this section generates a graph that displays the relationship between the input document and all documents that have been found as “also like” documents (and only these documents)
    """)
    # Static image produced by cw2.also_like_graph (run via the CLI).
    gr.Image('output.png')

demo.launch()
cw2.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import click
2
+ import pandas as pd
3
+ import graphviz
4
+ import pydot
5
+ import plotly.express as px
6
+ import matplotlib.pyplot as plt
7
+ import pycountry_convert as pc
8
+
9
# Pre-load every dataset size once at import time; run_task selects one
# of these frames by name via globals()[f"d_{f}"], so the variable names
# d_<size> are part of the CLI interface.  Each file is newline-delimited
# JSON (one event per line), hence lines=True.
d_tiny = pd.read_json('datasets/sample_tiny.json', lines=True)
d_small = pd.read_json('datasets/sample_small.json', lines=True)
d_100k_lines = pd.read_json('datasets/sample_100k_lines.json', lines=True)
d_400k_lines = pd.read_json('datasets/sample_400k_lines.json', lines=True)
13
+
14
def country_code_to_continent(country_code):
    """Map an ISO-3166 alpha-2 country code to a continent name.

    The issuu data uses the pseudo-codes "EU" and "AP", which
    pycountry_convert does not know, so they are special-cased up front.
    Any other code it cannot resolve maps to "Unknown country".
    """
    special = {"EU": "European Union", "AP": "Asia/Pacific Region"}
    if country_code in special:
        return special[country_code]
    try:
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except KeyError:
        return "Unknown country"
24
+
25
+
26
def get_views_by_country(data, doc_uuid):
    """Histogram of viewer countries for one document (Req 2a).

    Returns the matplotlib figure so callers (gradio/CLI) can display it.
    """
    country_views = data.loc[data['subject_doc_id'] == doc_uuid, 'visitor_country']
    fig = plt.figure(figsize=(10, 6))
    plt.title('Views by Country')
    plt.xlabel("Countries")
    plt.ylabel("No. of views")
    plt.hist(country_views)
    return fig
33
+
34
+
35
def get_views_by_continent(data, doc_uuid):
    """Histogram of viewer continents for one document (Req 2b).

    Requires the 'visitor_continent' column added at start-up; returns
    the matplotlib figure.
    """
    continent_views = data.loc[data['subject_doc_id'] == doc_uuid, 'visitor_continent']
    fig = plt.figure(figsize=(10, 6))
    plt.title('Views by Continent')
    plt.xlabel("Continents")
    plt.ylabel("No. of views")
    plt.hist(continent_views)
    return fig
42
+
43
+
44
def get_visitor_useragents(data):
    """Bar chart of raw visitor_useragent frequencies (Req 3a)."""
    fig = plt.figure(figsize=(10, 6))
    useragent_counts = data['visitor_useragent'].value_counts()
    useragent_counts.plot(kind='bar')
    return fig
48
+
49
+
50
def get_visitor_browsers(data):
    """Bar chart of browser-family frequencies (Req 3b).

    The browser family is the useragent text up to the first '/'
    (e.g. "Mozilla/5.0 ..." -> "Mozilla").
    """
    fig = plt.figure(figsize=(10, 6))
    # Fixed: previously this assigned the derived column back onto the
    # caller's DataFrame (data['visitor_browser'] = ...), silently
    # mutating the shared module-level datasets on every call.  Keep the
    # derived series local instead.
    browsers = data['visitor_useragent'].str.split('/').str[0]
    browsers.value_counts().plot(kind='bar')
    return fig
55
+
56
+
57
def get_avid_readers(data):
    """Top-10 visitors by total event_readtime, largest first (Req 4)."""
    total_read_time = data.groupby('visitor_uuid')['event_readtime'].sum()
    return total_read_time.sort_values(ascending=False).head(10)
59
+
60
+
61
def plot_avid_readers(data):
    """Horizontal bar chart of the top-10 avid readers (Req 4).

    Re-sorted ascending so the biggest reader ends up at the top of the
    barh plot; returns the matplotlib figure.
    """
    fig = plt.figure(figsize=(11, 7))
    top_readers = get_avid_readers(data).sort_values(ascending=True)
    top_readers.plot(kind='barh')
    return fig
65
+
66
+
67
+ # def get_doc_visitors(doc_uuid):
68
+ # return data_tiny[data_tiny['subject_doc_id'] == doc_uuid]['visitor_uuid'].unique()
69
+ #
70
+ #
71
+ # def get_visitor_docs(visitor_uuid):
72
+ # return data_tiny[data_tiny['visitor_uuid'] == visitor_uuid]['subject_doc_id'].dropna().unique()
73
+ #
74
+ #
75
+ # def get_also_likes_doc(doc_uuid, visitor_uuid, ascending):
76
+ # visitor_uuids = get_doc_visitors(doc_uuid)
77
+ # doc_uuids = sum([list(get_visitor_docs(reader)) for reader in visitor_uuids], [])
78
+ # return pd.Series(doc_uuids).value_counts(ascending=ascending)
79
+ #
80
+ #
81
+ # def also_like(doc_uuid, visitor_uuid, ascending):
82
+ # visitor_uuids = get_doc_visitors(doc_uuid)
83
+ # y = []
84
+ # for reader in visitor_uuids:
85
+ # for doc in get_visitor_docs(reader):
86
+ # if doc != doc_uuid and doc not in get_visitor_docs(visitor_uuid).tolist():
87
+ # y.append([reader, doc])
88
+ # return pd.DataFrame(y)
89
+
90
# Req 5a
def get_doc_visitors(data, doc_uuid):
    """Unique visitor UUIDs that read *doc_uuid* ('reader' events only)."""
    is_read = (data['env_type'] == "reader") & (data['subject_doc_id'] == doc_uuid)
    return data.loc[is_read, 'visitor_uuid'].unique()
94
+
95
+
96
# Req 5b
def get_visitor_docs(data, visitor_uuid):
    """Unique document UUIDs read by *visitor_uuid* ('reader' events only).

    Rows with a missing subject_doc_id are dropped.
    """
    is_read = (data['env_type'] == "reader") & (data['visitor_uuid'] == visitor_uuid)
    return data.loc[is_read, 'subject_doc_id'].dropna().unique()
100
+
101
+
102
# Req 5c, 5d
def also_like(data, doc_uuid, visitor_uuid, ascending, req_5=True):
    """Find documents that readers of *doc_uuid* have also read.

    Documents the requesting *visitor_uuid* has already read, and the
    input document itself, are excluded.

    Parameters:
        data: issuu event frame with 'env_type', 'visitor_uuid' and
            'subject_doc_id' columns.
        doc_uuid: the document whose "also likes" are requested.
        visitor_uuid: the visitor issuing the request.
        ascending: sort order of the document counts (only used when
            req_5 is True).
        req_5: when False, return the raw (reader, document) pairs
            instead of the aggregated table (used by Req 6's graph).

    Returns a DataFrame: either the raw pairs, or the ranked documents
    with their read counts; falls back to the (empty) pairs frame when
    no pairs were found.
    """
    # Fixed: the exclusion set was recomputed by the innermost loop for
    # every (reader, doc) pair; compute it once and use a set for O(1)
    # membership tests.  Also removed leftover debug print() calls.
    own_docs = set(get_visitor_docs(data, visitor_uuid))
    pairs = []
    for reader in get_doc_visitors(data, doc_uuid):
        for doc in get_visitor_docs(data, reader):
            if doc != doc_uuid and doc not in own_docs:
                pairs.append([reader, doc])
    if not req_5:
        return pd.DataFrame(pairs)
    try:
        # pd.DataFrame(pairs) has integer columns 0 (reader) and 1 (doc):
        # group by document, count readers, keep the ten smallest/largest.
        counts = pd.DataFrame(pairs).groupby(1).count()
        ranked = counts.nsmallest(10, [0]) if ascending else counts.nlargest(10, [0])
        # NOTE(review): tail(-1) drops the first row of the ranking —
        # preserved from the original, but looks suspicious; confirm it
        # is intentional and not meant to drop the queried document.
        return ranked.reset_index().tail(-1)
    except KeyError:
        # No pairs found: the empty frame has no column 1 to group by.
        return pd.DataFrame(pairs)
122
+
123
+
124
# Req 6
def also_like_graph(data, doc_uuid, visitor_uuid, ascending):
    """Render the "also likes" relationship graph to output.png (Req 6).

    Builds a Digraph with one edge per (reader, document) pair returned
    by also_like(..., req_5=False); each node is labelled with the last
    four characters of its UUID.  Writes output.dot and output.png and
    returns the PNG path; on any ordinary error, returns the fallback
    placeholder image path "test.jpg".
    """
    try:
        pairs = also_like(data, doc_uuid, visitor_uuid, ascending, req_5=False)
        dot = graphviz.Digraph()
        for _, row in pairs.iterrows():
            # Short 4-char labels keep the rendered graph readable.
            dot.node(str(row[0]), str(row[0])[-4:])
            dot.node(str(row[1]), str(row[1])[-4:])
            dot.edge(str(row[0]), str(row[1]))

        # Fixed: dropped a discarded `.replace('\\', '/')` on the render
        # result and a dead debug block that printed value_counts().
        dot.render('output.dot')

        (graph,) = pydot.graph_from_dot_file('output.dot')
        graph.write_png('output.png')
        return 'output.png'
    except Exception:
        # Fixed: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; keep the best-effort fallback image for
        # ordinary errors only.
        return "test.jpg"
148
+
149
+
150
+ # print(also_like(data_tiny, "100713205147-2ee05a98f1794324952eea5ca678c026", "b5d13a36dad1147b", False).groupby(1).count().nlargest(10, [0]))
151
+ # print(also_like("1s", "a", False).groupby(1).count())
152
+ #
153
@click.command()
@click.option('-u', type=str, help="user_uuid")
@click.option('-d', type=str, help="doc_uuid")
@click.option('-t', type=str, help="task_id")
@click.option('-f', type=str, help="file_name")
def run_task(u, d, t, f):
    """CLI entry point: run task *t* on dataset suffix *f*.

    u: visitor uuid, d: document uuid, t: one of 2a, 2b, 3a, 3b, 4, 5d, 6,
    f: dataset suffix (tiny, small, 100k_lines, 400k_lines).
    An unrecognised task id does nothing.
    """
    def dataset():
        # Resolve the module-level frame d_<f> lazily, so an unknown
        # task id never touches globals() (matches original behaviour).
        return globals()[f"d_{f}"]

    # 2a, 2b, 3a, 3b, 4, 5d, 6, 7
    if t == "2a":
        get_views_by_country(dataset(), d)
        plt.show()
    elif t == "2b":
        get_views_by_continent(dataset(), d)
        plt.show()
    elif t == "3a":
        get_visitor_useragents(dataset())
        plt.show()
    elif t == "3b":
        get_visitor_browsers(dataset())
        plt.show()
    elif t == "4":
        get_avid_readers(dataset())
    elif t == "5d":
        also_like(dataset(), d, u, False)
    elif t == "6":
        also_like_graph(dataset(), d, u, False)


if __name__ == '__main__':
    run_task()
184
+
datasets/sample_small.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/sample_tiny.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/test_small.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"visitor_uuid": "a","subject_doc_id": "1s"}
2
+ {"visitor_uuid": "b","subject_doc_id": "1s"}
3
+ {"visitor_uuid": "b","subject_doc_id": "5s"}
4
+ {"visitor_uuid": "c","subject_doc_id": "1s"}
5
+ {"visitor_uuid": "d","subject_doc_id": "1s"}
6
+ {"visitor_uuid": "c","subject_doc_id": "2s"}
7
+ {"visitor_uuid": "c","subject_doc_id": "3s"}
8
+ {"visitor_uuid": "c","subject_doc_id": "4s"}
9
+ {"visitor_uuid": "c","subject_doc_id": "5s"}
10
+ {"visitor_uuid": "d","subject_doc_id": "5s"}
11
+ {"visitor_uuid": "d","subject_doc_id": "6s"}
12
+ {"visitor_uuid": "d","subject_doc_id": "7s"}
13
+ {"visitor_uuid": "d","subject_doc_id": "2s"}
14
+ {"visitor_uuid": "a","subject_doc_id": "4s"}
15
+ {"visitor_uuid": "e","subject_doc_id": "5s"}
16
+ {"visitor_uuid": "e","subject_doc_id": "8s"}
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ click==8.1.3
2
+ gradio==3.12.0
3
+ graphviz==0.20.1
4
+ matplotlib==3.6.2
5
+ pandas==1.5.1
6
+ plotly==5.11.0
7
+ pycountry_convert==0.7.2
8
+ pydot==1.4.2