Dr.Analytics / cw2.py
abdullahmeda's picture
.
26aab38
raw
history blame
6.14 kB
import logging

import click
import graphviz
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import pycountry_convert as pc
import pydot
# Event-log samples loaded once at import time (line-delimited JSON, hence
# lines=True). run_task looks these frames up by name through
# globals()[f"d_{f}"], so the part after "d_" is the value accepted by -f.
d_tiny = pd.read_json('datasets/sample_tiny.json', lines=True)
d_small = pd.read_json('datasets/sample_small.json', lines=True)
# Larger samples, currently disabled:
# d_100k_lines = pd.read_json('datasets/sample_100k_lines.json', lines=True)
# d_400k_lines = pd.read_json('datasets/sample_400k_lines.json', lines=True)
def country_code_to_continent(country_code):
    """Translate a two-letter country code into a continent name.

    The pseudo-codes "EU" and "AP" are answered directly; every other code
    is resolved through pycountry_convert.

    Args:
        country_code: Alpha-2 country code (or one of the pseudo-codes).

    Returns:
        The continent name, or "Unknown country" when the lookup raises
        KeyError.
    """
    # Regional pseudo-codes are handled before the library lookup.
    if country_code == "EU":
        return "European Union"
    if country_code == "AP":
        return "Asia/Pacific Region"

    try:
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except KeyError:
        return "Unknown country"
def get_views_by_country(data, doc_uuid):
    """Plot a histogram of the visitor countries recorded for one document.

    Args:
        data: DataFrame with ``subject_doc_id`` and ``visitor_country`` columns.
        doc_uuid: Identifier of the document whose rows are plotted.

    Returns:
        The newly created matplotlib figure containing the histogram.
    """
    figure = plt.figure(figsize=(10, 6))

    # Axis decoration is independent of the data, so it is set up first.
    plt.ylabel("No. of views")
    plt.xlabel("Countries")
    plt.title('Views by Country')

    doc_rows = data[data['subject_doc_id'] == doc_uuid]
    plt.hist(doc_rows['visitor_country'])
    return figure
def get_views_by_continent(data, doc_uuid):
    """Plot a histogram of the visitor continents recorded for one document.

    If ``data`` already carries a ``visitor_continent`` column it is used
    as-is. Otherwise the continents are derived from ``visitor_country``
    with ``country_code_to_continent``; nothing else in this module creates
    the column, so relying on it alone would raise KeyError.

    Args:
        data: DataFrame with a ``subject_doc_id`` column and either a
            ``visitor_continent`` or a ``visitor_country`` column. The frame
            is not modified.
        doc_uuid: Identifier of the document whose rows are plotted.

    Returns:
        The newly created matplotlib figure containing the histogram.
    """
    fig = plt.figure(figsize=(10, 6))
    plt.title('Views by Continent')
    plt.xlabel("Continents")
    plt.ylabel("No. of views")

    doc_views = data[data['subject_doc_id'] == doc_uuid]
    if 'visitor_continent' in doc_views.columns:
        continents = doc_views['visitor_continent']
    else:
        # Derive the continent per row without touching the caller's frame.
        continents = doc_views['visitor_country'].map(country_code_to_continent)

    plt.hist(continents)
    return fig
def get_visitor_useragents(data):
    """Plot how often each full user-agent string occurs as a bar chart.

    Args:
        data: DataFrame with a ``visitor_useragent`` column.

    Returns:
        The newly created matplotlib figure the bars are drawn on.
    """
    figure = plt.figure(figsize=(10, 6))
    agent_counts = data['visitor_useragent'].value_counts()
    agent_counts.plot(kind='bar')
    return figure
def get_visitor_browsers(data):
    """Plot how often each browser name occurs as a bar chart.

    The browser name is taken to be the part of the user-agent string
    before the first '/'.

    Note:
        This writes (or overwrites) a ``visitor_browser`` column on the
        DataFrame that was passed in.

    Args:
        data: DataFrame with a ``visitor_useragent`` column.

    Returns:
        The newly created matplotlib figure the bars are drawn on.
    """
    figure = plt.figure(figsize=(10, 6))

    agent_parts = data['visitor_useragent'].str.split('/')
    data['visitor_browser'] = agent_parts.str[0]

    browser_counts = data['visitor_browser'].value_counts()
    browser_counts.plot(kind='bar')
    return figure
def get_avid_readers(data):
    """Return the ten visitors with the largest total reading time.

    Args:
        data: DataFrame with ``visitor_uuid`` and a numeric
            ``event_readtime`` column.

    Returns:
        A Series indexed by ``visitor_uuid`` holding the summed
        ``event_readtime``, sorted from largest to smallest and cut to at
        most ten entries.
    """
    totals_per_visitor = data.groupby('visitor_uuid').sum(numeric_only=True)
    read_time = totals_per_visitor['event_readtime']
    ranked = read_time.sort_values(ascending=False)
    return ranked.head(10)
def plot_avid_readers(data):
    """Draw the result of ``get_avid_readers`` as a horizontal bar chart.

    Args:
        data: DataFrame accepted by ``get_avid_readers``.

    Returns:
        The newly created matplotlib figure the bars are drawn on.
    """
    figure = plt.figure(figsize=(11, 7))
    top_readers = get_avid_readers(data)
    # The ranking arrives in descending order; it is re-sorted ascending
    # before being handed to the plotting call.
    bars_in_plot_order = top_readers.sort_values(ascending=True)
    bars_in_plot_order.plot(kind='barh')
    return figure
# def get_doc_visitors(doc_uuid):
# return data_tiny[data_tiny['subject_doc_id'] == doc_uuid]['visitor_uuid'].unique()
#
#
# def get_visitor_docs(visitor_uuid):
# return data_tiny[data_tiny['visitor_uuid'] == visitor_uuid]['subject_doc_id'].dropna().unique()
#
#
# def get_also_likes_doc(doc_uuid, visitor_uuid, ascending):
# visitor_uuids = get_doc_visitors(doc_uuid)
# doc_uuids = sum([list(get_visitor_docs(reader)) for reader in visitor_uuids], [])
# return pd.Series(doc_uuids).value_counts(ascending=ascending)
#
#
# def also_like(doc_uuid, visitor_uuid, ascending):
# visitor_uuids = get_doc_visitors(doc_uuid)
# y = []
# for reader in visitor_uuids:
# for doc in get_visitor_docs(reader):
# if doc != doc_uuid and doc not in get_visitor_docs(visitor_uuid).tolist():
# y.append([reader, doc])
# return pd.DataFrame(y)
# Req 5a
def get_doc_visitors(data, doc_uuid):
    """Return the distinct visitors that have a "reader" event for a document.

    Args:
        data: DataFrame with ``env_type``, ``subject_doc_id`` and
            ``visitor_uuid`` columns.
        doc_uuid: Identifier of the document of interest.

    Returns:
        Array of unique ``visitor_uuid`` values in order of first appearance.
    """
    is_reader_event = data['env_type'] == "reader"
    is_requested_doc = data['subject_doc_id'] == doc_uuid
    matching_visitors = data.loc[is_reader_event & is_requested_doc, 'visitor_uuid']
    return matching_visitors.unique()
# Req 5b
def get_visitor_docs(data, visitor_uuid):
    """Return the distinct documents a visitor has "reader" events for.

    Rows without a document id are ignored.

    Args:
        data: DataFrame with ``env_type``, ``subject_doc_id`` and
            ``visitor_uuid`` columns.
        visitor_uuid: Identifier of the visitor of interest.

    Returns:
        Array of unique, non-null ``subject_doc_id`` values in order of
        first appearance.
    """
    is_reader_event = data['env_type'] == "reader"
    is_requested_visitor = data['visitor_uuid'] == visitor_uuid
    docs_seen = data.loc[is_reader_event & is_requested_visitor, 'subject_doc_id']
    return docs_seen.dropna().unique()
# Req 5c, 5d
def also_like(data, doc_uuid, visitor_uuid, ascending, req_5=True):
    """Find documents that readers of ``doc_uuid`` have also read.

    For every reader of ``doc_uuid`` the reader's other documents are
    collected as ``[reader, document]`` pairs, skipping ``doc_uuid`` itself
    and anything ``visitor_uuid`` has already read.

    Args:
        data: Event DataFrame accepted by ``get_doc_visitors`` and
            ``get_visitor_docs``.
        doc_uuid: Document the recommendations are based on.
        visitor_uuid: Visitor whose already-read documents are excluded.
        ascending: When true, rank by fewest readers first; otherwise by
            most readers first. Ignored when ``req_5`` is false.
        req_5: When false, return the raw pairs instead of the ranking.

    Returns:
        With ``req_5`` false, or when no pairs were found: a DataFrame of
        the raw pairs (column 0 = reader, column 1 = document; no columns at
        all when empty). Otherwise a DataFrame with column 1 = document and
        column 0 = number of readers, taken from the ten smallest/largest
        counts with the first ranked row removed.
    """
    # Loop-invariant: look up the requesting visitor's documents only once
    # instead of on every inner-loop iteration.
    already_read = set(get_visitor_docs(data, visitor_uuid).tolist())

    pairs = [
        [reader, doc]
        for reader in get_doc_visitors(data, doc_uuid)
        for doc in get_visitor_docs(data, reader)
        if doc != doc_uuid and doc not in already_read
    ]
    pairs_df = pd.DataFrame(pairs)

    # An empty frame has no column 1 to group by, so return it as-is
    # rather than relying on a KeyError from groupby.
    if not req_5 or pairs_df.empty:
        return pairs_df

    # Each pair is unique per reader, so the count is the number of readers.
    readers_per_doc = pairs_df.groupby(1).count()
    if ascending:
        ranked = readers_per_doc.nsmallest(10, [0])
    else:
        ranked = readers_per_doc.nlargest(10, [0])

    # NOTE(review): tail(-1) drops the first ranked row. doc_uuid is already
    # filtered out above, so this looks like it discards a genuine result --
    # kept for backward compatibility; confirm whether it is intended.
    return ranked.reset_index().tail(-1)
# Req 6
def also_like_graph(data, doc_uuid, visitor_uuid, ascending):
    """Render the "also like" reader/document relationships as a PNG graph.

    Every ``[reader, document]`` pair returned by ``also_like(...,
    req_5=False)`` becomes an edge from the reader node to the document
    node; nodes are labelled with the last four characters of their id.

    Args:
        data: Event DataFrame accepted by ``also_like``.
        doc_uuid: Document the recommendations are based on.
        visitor_uuid: Visitor whose already-read documents are excluded.
        ascending: Forwarded to ``also_like`` (unused there for raw pairs).

    Returns:
        ``'output.png'`` when the graph was written, or the placeholder
        ``"output_test.jpg"`` when there is nothing to draw or rendering
        failed.
    """
    placeholder = "output_test.jpg"
    try:
        pairs = also_like(data, doc_uuid, visitor_uuid, ascending, req_5=False)
        if pairs.empty:
            # Nothing to draw; an empty frame has no reader/document columns.
            return placeholder

        dot = graphviz.Digraph()
        for reader, doc in pairs.itertuples(index=False):
            reader_id, doc_id = str(reader), str(doc)
            dot.node(reader_id, reader_id[-4:])
            dot.node(doc_id, doc_id[-4:])
            dot.edge(reader_id, doc_id)

        # Write the DOT source to disk, then let pydot convert it to PNG.
        dot.render('output.dot')
        (graph,) = pydot.graph_from_dot_file('output.dot')
        graph.write_png('output.png')
        return 'output.png'
    except Exception:
        # Best-effort: keep returning the placeholder image, but record why
        # instead of silently swallowing the failure.
        logging.getLogger(__name__).exception(
            "Could not render the also-like graph for document %r", doc_uuid
        )
        return placeholder
# print(also_like(data_tiny, "100713205147-2ee05a98f1794324952eea5ca678c026", "b5d13a36dad1147b", False).groupby(1).count().nlargest(10, [0]))
# print(also_like("1s", "a", False).groupby(1).count())
#
@click.command()
@click.option('-u', type=str, help="user_uuid")
@click.option('-d', type=str, help="doc_uuid")
@click.option('-t', type=str, help="task_id")
@click.option('-f', type=str, help="file_name")
def run_task(u, d, t, f):
    """Run one analysis task against one of the loaded datasets.

    Supported task ids: 2a, 2b, 3a, 3b (plots shown in a window) and
    4, 5d, 6 (result printed to stdout).

    Args:
        u: Visitor uuid (used by tasks 5d and 6).
        d: Document uuid (used by tasks 2a, 2b, 5d and 6).
        t: Task id.
        f: Dataset name; resolved to the module-level frame ``d_<f>``.

    Raises:
        click.BadParameter: If ``f`` does not name a loaded dataset or ``t``
            is not a supported task id.
    """
    # Resolve the dataset once and fail with a readable message instead of
    # a raw KeyError from the globals() lookup.
    dataset = globals().get(f"d_{f}")
    if not isinstance(dataset, pd.DataFrame):
        raise click.BadParameter(f"no dataset named {f!r} is loaded", param_hint="-f")

    if t == "2a":
        get_views_by_country(dataset, d)
        plt.show()
    elif t == "2b":
        get_views_by_continent(dataset, d)
        plt.show()
    elif t == "3a":
        get_visitor_useragents(dataset)
        plt.show()
    elif t == "3b":
        get_visitor_browsers(dataset)
        plt.show()
    elif t == "4":
        # The ranking used to be computed and thrown away; show it.
        click.echo(get_avid_readers(dataset))
    elif t == "5d":
        click.echo(also_like(dataset, d, u, False))
    elif t == "6":
        # Prints the path of the image that was produced.
        click.echo(also_like_graph(dataset, d, u, False))
    else:
        raise click.BadParameter(f"unknown task id {t!r}", param_hint="-t")
# Script entry point: click parses the -u/-d/-t/-f options from the command
# line and passes them to run_task.
if __name__ == '__main__':
    run_task()