[ [ "Number of Document of Each Topic", { "type": "pie", "kwargs": { "x": [ 324853053, 127033069, 233531055, 123094708, 267497100, 148588074, 581871647, 165387460, 390492627, 244588996, 170281196, 914696921, 281274506, 686870899, 19458015, 1411221902, 366116749 ], "labels": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "autopct": "%1.1f%%", "colors": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ], "pctdistance": 1.2, "labeldistance": 1.5 }, "comment": "As shown in the graph 
above, over 20% of the documents are related to Business & Economics & Finance, which makes it the largest topic group in the dataset. In contrast, the Culture & Cultural geography group contains the smallest number of documents among all topics." } ], [ "Fraction of Words Corrected in Lines", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.005660839019938058, 0.005601641737204916, 0.010656858389603374, 0.006108459524594901, 0.005077341851036456, 0.004333818728677237, 0.00812686384284095, 0.005099914065389049, 0.005922873834475705, 0.008028764588273587, 0.005868815973653353, 0.007446294346393395, 0.006845364607248323, 0.007812665071102337, 0.007692180748283549, 0.006834288663313659, 0.007850315335340054 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 
0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "On average, documents related to Shopping & Commodity have a larger fraction of words corrected in lines." } ], [ "Fraction of Lines Ending with Ellipsis", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.013698353704877283, 0.011988367184873385, 0.010239788510367555, 0.013182844498032174, 0.012825014289657984, 0.016784713501187303, 0.013729740175749594, 0.012272497721678627, 0.011805768817329271, 0.013464839491767208, 0.012785021526251267, 0.015677345947523093, 0.011127706885026923, 0.012810749078485683, 0.013244961193298873, 0.012872493046687979, 0.014188113777531883 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 
0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "Compared with other topics, documents in Personal Development & Human Resources & Career have, on average, a higher fraction of lines ending with an ellipsis." } ], [ "Fraction of Lines Starting with Bullet Point", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.05924759002174845, 0.06636489569195865, 0.1111156447572103, 0.057900707172324956, 0.05498350949228654, 0.04950217629831486, 0.09247477225558454, 0.06597399742617387, 0.08548870827846955, 0.09873316891194645, 0.06547543788491705, 0.0735152711822082, 0.08847503034590092, 0.07390893089349196, 0.058802087892367495, 0.08333351946410401, 0.06067125030474924 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], 
[ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "Shopping & Commodity related documents have higher percentage of lines starting with bullet point." 
} ], [ "Number of Lines with Toxic Words", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.19922507238988454, 0.8737279739340943, 0.2723972492651994, 0.2602153376081773, 0.26610157268994694, 1.0438407391968751, 0.13075538461491662, 0.43004878362603793, 0.7763741362522576, 0.15141952256920013, 0.1365380766999076, 0.7216673095153012, 0.10996786534219351, 1.0588212632953606, 0.13198319561373553, 0.1422862362860352, 0.45226715918424154 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 
0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "On average, Daily Life & Home & Lifestyle documents have more lines with toxic words than other topics." } ], [ "Number of Toxic Words", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.2733295968131166, 1.5669363935464709, 0.44824271872535326, 0.372924147153426, 0.3350402266043258, 2.0669407694186814, 0.19540095755860054, 0.5992540667835397, 1.458653655962626, 0.1993768722121906, 0.18759733165134687, 1.4411351199901983, 0.1523272713524915, 2.455736465842033, 0.18513111435056454, 0.22537730852195914, 0.781900764665645 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 
0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "On average, Daily Life & Home & Lifestyle documents contain more toxic words than other topics." } ], [ "Word Count", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 528.940000185253, 630.8575219496587, 331.12940931560473, 652.5414443568118, 639.8122070257958, 745.046993327338, 616.6008528251936, 561.0260046983005, 739.9628602078574, 427.55716142683707, 611.8977898886733, 470.5665158383101, 557.2392759050832, 448.4545774765747, 666.4168803960733, 555.4812271358867, 506.7156364621822 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 
0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "On average, documents in the topic of Personal Development & Human Resources & Career contain more words than those in other topics." } ], [ "Mean Word Length", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 4.8591533713129555, 5.180746132747496, 4.95975994177285, 4.875042818402709, 5.168670579970495, 4.654983410185081, 5.237515458154388, 5.1004147156966715, 5.205703704499496, 4.880593401877592, 5.08581294318828, 4.914944728270949, 5.264151733240911, 4.9967250103431935, 5.143653278714547, 5.172399304913307, 4.948735274753513 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 
0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "There is no significant variance in the average word length for different topic groups. However, Education related data contain longer words than others in general." } ], [ "Number of Sentences", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 23.889495217396032, 28.667429824906456, 17.00780648209721, 32.56295546840243, 27.016352794105057, 41.47359763206837, 27.639144560346658, 24.82906568611671, 34.94192956170719, 22.905121737365487, 28.89842860864097, 22.790945701674666, 26.620081949410658, 22.43558780468875, 30.96235911011478, 25.786870409555195, 27.291027835495175 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 
0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "Documents in the topic of Personal Development & Human Resources & Career usually contain more sentences." 
} ], [ "Symbol to Word Ratio", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.0029673437245102876, 0.002369316991198444, 0.0026953845515368074, 0.0030995856207761256, 0.002515345366978788, 0.003716508288521279, 0.002910243583180489, 0.0021063347433407133, 0.0023350751882016177, 0.00325952936332765, 0.0026651973287582483, 0.0037352572697365097, 0.002278588397824893, 0.0033945285429091187, 0.002321581720070917, 0.002557382224711868, 0.0035588078008559885 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 
0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "Documents related to Entertainment & Travel & Hobby usually have higher percentage of symbols." } ], [ "Fraction of Words with Alpha Character", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.9543530503658745, 0.966243732434154, 0.9437721848599528, 0.9641198106485631, 0.9655064629815291, 0.9789937421507563, 0.9480065698252734, 0.9637242361370412, 0.9640004505795688, 0.950377474345678, 0.9627294216362635, 0.9531905135921064, 0.9586824669836848, 0.9522644098234544, 0.9526614781429045, 0.9564103310368344, 0.9600895447178572 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 
0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "The fraction of words with alpha character seems to be relatively consistent across different topics." } ], [ "Number of Stop Words", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 107.18473168851517, 138.47942025237538, 61.60918090315654, 149.6609807547535, 142.7609030677342, 156.05911428665533, 121.44399604677766, 120.49043039901574, 151.07195976327614, 83.29154811200092, 130.22922543954883, 93.5245517591504, 114.12362959051823, 85.78707441498406, 158.3108779081525, 113.40855796468499, 104.68147387870529 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 
0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "On average, Culture & Cultural geography documents contain more stop words than other topics." } ], [ "Has Curly Bracket", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.005196103236253101, 0.006631383517940514, 0.010188400853154199, 0.00940792678106032, 0.011743005812025626, 0.009386123411223433, 0.027600317153793196, 0.0069009343271853865, 0.009196375940793372, 0.010313546566910966, 0.007981867827613802, 0.007428252838734548, 0.010558184039615734, 0.011781598276738173, 0.01233861727416697, 0.00927681818248878, 0.00859682603594844 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 
0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "Natural Science & Formal Science & Technology has a significantly higher percentage of documents that contain curly brackets. This might be related to source code in the data." 
} ], [ "Number of Document Duplication", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 7.537686235012851, 7.1025928374603, 5.966075852310092, 7.145161033242794, 7.434088750868701, 6.676430767922868, 6.555241123477528, 7.261966584407307, 6.24676702026438, 6.911925072867955, 7.12188790945537, 6.103017919746556, 6.569012504105153, 5.991152694037777, 8.832861317045957, 6.079587144899626, 6.927007100677604 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 
0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "Culture & Cultural geography related documents have a higher duplication count than other topics." } ], [ "Number of Dump Duplication", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 3.192987467475025, 3.256401260367881, 2.059270656744132, 3.2623413672665764, 3.4332213994095637, 3.184122448481296, 2.718956481479841, 3.2133479527407944, 2.675923289583647, 3.1140861341121004, 3.18235214298119, 2.64155309428444, 2.727249994708017, 2.398892457664013, 3.9119670737225767, 2.418408296500489, 3.0585330664563504 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 
0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "On average, Culture & Cultural geography related documents are duplicated across a higher number of Common Crawl dumps. Duplication of Shopping & Commodity appears in fewer dumps than others." } ], [ "Number of Year Duplication", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 1.5100191962794944, 1.5532559399946482, 1.286292351995755, 1.5752399851340482, 1.583747083613243, 1.5616323218510795, 1.4345138525025949, 1.5322301884314566, 1.4292428061644298, 1.514165600483515, 1.5310395693955543, 1.420375959699989, 1.4311913003590877, 1.3601090413935268, 1.6918824453573502, 1.372621704109578, 1.5093211072952033 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 
0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "On average, Culture & Cultural geography related documents are duplicated across more years than other topics." } ], [ "Maximum Span of Year Duplication", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 1.6027068922144314, 1.6463481331778262, 1.326032869589871, 1.6654945962421066, 1.7014688495688364, 1.6394770282842483, 1.5158579087803534, 1.622844253125358, 1.4877078793090759, 1.5888212812321287, 1.6172138231869126, 1.4887702131010017, 1.492519460686565, 1.41459098414941, 1.822498749230073, 1.4288667063218525, 1.592900066967436 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 
0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "On average, Culture & Cultural geography related documents are duplicated across a wider span of years." 
} ], [ "Language Score", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.940489806219048, 0.9341014072001546, 0.8833469805761418, 0.9326888501156927, 0.9414304493962583, 0.9514325652491805, 0.8825959914278214, 0.9474163424125213, 0.9228861253995115, 0.9051492112749342, 0.9259433469236898, 0.9106329146251756, 0.9205018098890236, 0.8984234924235204, 0.922120043098531, 0.9144863004649139, 0.9163656720680041 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 
0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "Average language scores of different topic groups are mostly consistent. No significant differences are observed." } ], [ "Fraction of Duplicate Lines", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.012600997185367828, 0.01188115899050692, 0.019039766660027862, 0.0124898927764339, 0.011646428662052831, 0.010610017211082174, 0.0159476139009855, 0.012597314331886177, 0.015094734040349217, 0.014975673115722092, 0.012534733023571196, 0.01610487136667016, 0.015238474263765327, 0.01591887690664154, 0.01433554300372473, 0.015517507810570494, 0.015401894047378658 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 
0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "On average, Shopping & Commodity has a larger fraction of duplicate lines than others." } ], [ "Fraction of Characters in Duplicate Lines", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.004938858081287135, 0.004205415457276368, 0.007552357256944613, 0.004582746666516553, 0.0044753076683352235, 0.003940081675446834, 0.006179645047614322, 0.0050437770645133185, 0.005686946797304247, 0.005994693646977406, 0.0046510979989690445, 0.006342709242367984, 0.005829011670104205, 0.006381457735701225, 0.005068730018848793, 0.005971977053954138, 0.005856182489324971 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 
0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 }, "comment": "Shopping & Commodity usually has a larger fraction of characters in duplicate lines than others." } ], [ "Fraction of Characters in Most Common Bigram", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.026404273123764796, 0.025634991525195914, 0.03614706934891397, 0.026733992014165146, 0.02377814829063671, 0.019649114365205896, 0.03137691450183766, 0.0270495750357038, 0.027673178183087933, 0.029942339233414595, 0.027350265679715224, 0.030526314564882247, 0.030614040432541026, 0.03509742016691783, 0.027540083176404263, 0.029519105783701725, 0.028834560229748462 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 
0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 } } ], [ "Fraction of Characters in Most Common 3-gram", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.026165437832849362, 0.026251631875192687, 0.03811434473394529, 0.027814565987299922, 0.02458896408514931, 0.020185288853227328, 0.031209395373852387, 0.027345772022684685, 0.026970288190643604, 0.030974020712503342, 0.027787662286871063, 0.03143649261443422, 0.030952890587447934, 0.035409395984874435, 0.028486665111510972, 0.029371087024795153, 0.030651728333515618 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 
1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 } } ], [ "Fraction of Characters in Most Common 4-gram", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.026986818387229945, 0.027131471441621514, 0.04096520359337113, 0.02892589321318727, 0.025567532544329325, 0.02099809142740805, 0.03184294279840072, 0.027798282452682368, 0.027173562606014456, 0.03262575837410923, 0.028962796310066586, 0.03275942719001153, 
0.03150508247840716, 0.03652360609065789, 0.029967601450189167, 0.029689817413511087, 0.032849409839897196 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 } } ], [ "Fraction of Characters in Duplicate 5-grams", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.017172412387336047, 0.018706184899018846, 0.020367380081370155, 0.01912555365329135, 
0.017726236260087368, 0.014196179855798982, 0.026453126704962582, 0.02113702754713442, 0.022750991771259714, 0.017193015331520775, 0.018542560337896252, 0.019254844137973823, 0.022992720412462874, 0.02142410388811584, 0.019425070816460523, 0.021273316544081922, 0.016721728689196018 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 } } ], [ "Fraction of Characters in Duplicate 6-grams", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", 
"Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.011970605317388902, 0.013247085951653395, 0.014520268748996269, 0.013346559736076019, 0.012676990872510209, 0.009987587475557972, 0.019267682560495096, 0.014804574416538653, 0.01599622943881697, 0.012044805932442022, 0.013103480807140754, 0.013565938336254593, 0.015985684726478346, 0.014952398432378033, 0.014038548162484649, 0.01499523284606334, 0.01191912942692566 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 } } ], [ "Fraction of Characters in Duplicate 7-grams", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & 
Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.009039855940169648, 0.010073325457247666, 0.010696329689399887, 0.009986471165253209, 0.009836605907258674, 0.007629540534323915, 0.014862665767002146, 0.011114675308487159, 0.011994499294618745, 0.00907677139620476, 0.009828884274472392, 0.010105882592285087, 0.011911397850057279, 0.011114070775684791, 0.010705940851157975, 0.011289206913404862, 0.009020791758628242 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 } } ], [ "Fraction of Characters in Duplicate 8-grams", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & 
Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.007195970331280814, 0.008036785010913946, 0.008249465083499188, 0.007850453980445486, 0.007992494679173406, 0.006182227979778321, 0.011985128922160077, 0.008779317883993009, 0.009467708596743243, 0.0072104637314519765, 0.007673480063984403, 0.007904073310509803, 0.00934269422506397, 0.008657166799636231, 0.008485033120385031, 0.00893995298657962, 0.0070980585511939785 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", 
"subplots_adjust": { "left": 0.37, "right": 0.98 } } ], [ "Fraction of Characters in Duplicate 9-grams", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.006007801998269901, 0.0066553421544330435, 0.006529544209051376, 0.006431532192110704, 0.006749247730086534, 0.0052160144431644155, 0.009999607112669078, 0.007230897967718682, 0.007732210045591141, 0.005943279041721623, 0.006205408840055294, 0.006469113514028088, 0.007626168747361047, 0.006984803950948357, 0.006992627523875565, 0.007390774121782952, 0.005766236221861412 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 
0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 } } ], [ "Fraction of Characters in Duplicate 10-grams", { "type": "barh", "kwargs": { "y": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "width": [ 0.0051710981353331464, 0.005682713875756294, 0.005363391346741175, 0.005464661863661183, 0.005846752796603754, 0.0045485380381742845, 0.00859601801316329, 0.006136039855302813, 0.0065548495409889435, 0.005091836417990565, 0.005250172827665216, 0.005493532455475418, 0.0064690246645603714, 0.00581859783771439, 0.005913783298441542, 0.006306495977016168, 0.004847123500711834 ], "color": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 
0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ] }, "x_label": "Metrics", "subplots_adjust": { "left": 0.37, "right": 0.98 } } ], [ "Number of Document of Each Topic in Duplication Bucket 1-1", { "type": "pie", "kwargs": { "x": [ 132249226, 47101525, 108551234, 42778158, 106867576, 52904902, 254436283, 65155001, 160648797, 96650903, 67150855, 409977727, 110689452, 314681138, 6571908, 632953103, 144137883 ], "labels": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "autopct": "%1.1f%%", "colors": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 
0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ], "pctdistance": 1.2, "labeldistance": 1.5 } } ], [ "Number of Document of Each Topic in Duplication Bucket 2-5", { "type": "pie", "kwargs": { "x": [ 104341527, 43192514, 75077276, 41770802, 83866134, 49842746, 190845342, 53891858, 130879713, 77844628, 54343851, 284749532, 98624858, 215519319, 6476944, 461711335, 117251313 ], "labels": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "autopct": "%1.1f%%", "colors": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 
0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ], "pctdistance": 1.2, "labeldistance": 1.5 } } ], [ "Number of Document of Each Topic in Duplication Bucket 6-10", { "type": "pie", "kwargs": { "x": [ 38443961, 16376927, 24393877, 18121006, 33219823, 22183319, 61039668, 20168641, 46703585, 32303976, 22128963, 101878274, 33435189, 76066340, 2712172, 150381514, 48762315 ], "labels": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "autopct": "%1.1f%%", "colors": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 
1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ], "pctdistance": 1.2, "labeldistance": 1.5 } } ], [ "Number of Document of Each Topic in Duplication Bucket 11-100", { "type": "pie", "kwargs": { "x": [ 47907124, 19868534, 24683580, 19990446, 42239133, 23300618, 73293979, 25410832, 51086693, 36943522, 25974730, 115373947, 37581053, 78644079, 3587872, 162190029, 54743888 ], "labels": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "autopct": "%1.1f%%", "colors": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 
0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ], "pctdistance": 1.2, "labeldistance": 1.5 } } ], [ "Number of Document of Each Topic in Duplication Bucket 101-1000", { "type": "pie", "kwargs": { "x": [ 1879583, 484835, 792159, 425638, 1281577, 350055, 2198979, 744933, 1141913, 825604, 669062, 2659131, 922347, 1892942, 106472, 3840419, 1189540 ], "labels": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "autopct": "%1.1f%%", "colors": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 
0.4980392156862745, 1.0 ], [ 0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ], "pctdistance": 1.2, "labeldistance": 1.5 } } ], [ "Number of Document of Each Topic in Duplication Bucket 1001-30000000", { "type": "pie", "kwargs": { "x": [ 31632, 8734, 32929, 8658, 22857, 6434, 57396, 16195, 31926, 20363, 13735, 58310, 21607, 67081, 2647, 145502, 31810 ], "labels": [ "Sports", "Society & Social Issues & Human Rights", "Shopping & Commodity", "Religion & Spirituality", "Politics & Government", "Personal Development & Human Resources & Career", "Natural Science & Formal Science & Technology", "Law & Justice", "Health & Wellness & Medicine", "Food & Drink & Cooking", "Environment", "Entertainment & Travel & Hobby", "Education", "Daily Life & Home & Lifestyle", "Culture & Cultural geography", "Business & Economics & Finance", "Arts" ], "autopct": "%1.1f%%", "colors": [ [ 1.0, 0.4980392156862745, 0.054901960784313725, 1.0 ], [ 1.0, 0.7333333333333333, 0.47058823529411764, 1.0 ], [ 0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0 ], [ 0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0 ], [ 0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0 ], [ 1.0, 0.596078431372549, 0.5882352941176471, 1.0 ], [ 0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0 ], [ 0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0 ], [ 0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0 ], [ 0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0 ], [ 0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0 ], [ 0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0 ], [ 0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0 ], [ 0.7803921568627451, 
0.7803921568627451, 0.7803921568627451, 1.0 ], [ 0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0 ], [ 0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0 ], [ 0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0 ] ], "pctdistance": 1.2, "labeldistance": 1.5 } } ] ]