greco committed on
Commit
3cb656a
1 Parent(s): 9735a9a

Revert "update"

Browse files

This reverts commit 9735a9a113298ae9a684f187a906816c063807c8.

.gitattributes CHANGED
@@ -1,2 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  models/distilbart-mnli-12-1/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
2
- models/distilbert-base-uncased-finetuned-sst-2-english/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ models/bertopic_model_tokyo_olympics_tweets filter=lfs diff=lfs merge=lfs -text
29
+ models/bertopic_model_tokyo_olympics_tweets_unclean filter=lfs diff=lfs merge=lfs -text
30
+ models/distilbart-mnli-12-1/flax_model.msgpack filter=lfs diff=lfs merge=lfs -text
31
  models/distilbart-mnli-12-1/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
32
+ models/distilbert-base-uncased-finetuned-sst-2-english/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,5 +1,7 @@
1
  # custom
2
  survey_analytics.ipynb
 
 
3
 
4
  # Byte-compiled / optimized / DLL files
5
  __pycache__/
 
1
  # custom
2
  survey_analytics.ipynb
3
+ embeddings_unclean.pickle
4
+ embeddings.pickle
5
 
6
  # Byte-compiled / optimized / DLL files
7
  __pycache__/
README.md CHANGED
@@ -1,2 +1,13 @@
1
- # survey_analytics
2
- Survey Analytic Techniques
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Survey Analytics
3
+ emoji: 🐨
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.10.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: gpl-3.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -9,7 +9,6 @@ import os
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
  import plotly.express as px
12
- import pickle
13
 
14
  # factor analysis
15
  from factor_analyzer import FactorAnalyzer
@@ -20,6 +19,7 @@ from scipy.stats import zscore
20
  # nlp
21
  from bertopic import BERTopic
22
  from transformers import pipeline
 
23
 
24
  # custom
25
  import survey_analytics_library as LIB
@@ -40,10 +40,16 @@ def read_survey_data():
40
  data_survey, data_questions = read_survey_data()
41
 
42
  @st.cache
43
- def read_tokyo_data():
44
  tokyo = pd.read_csv(data_path+'tokyo_olympics_tweets.csv')
45
  return tokyo
46
- tokyo = read_tokyo_data()
 
 
 
 
 
 
47
 
48
  @st.cache(allow_output_mutation=True)
49
  def load_bertopic_model():
@@ -276,9 +282,14 @@ st.write('''
276
  ''')
277
  st.write('\n')
278
 
279
- # load and plot topics using unclean data
280
- with open('data/topics_tokyo_unclean.pickle', 'rb') as pkl:
281
- fig = pickle.load(pkl)
 
 
 
 
 
282
  st.plotly_chart(fig, use_container_width=True)
283
 
284
  st.write('''
@@ -300,9 +311,14 @@ labelled_topics = [
300
  'Vikas Krishan (Indian Boxer)',
301
  ]
302
 
303
- # load plot topics using clean data with stopwords removed
304
- with open('data/topics_tokyo.pickle', 'rb') as pkl:
305
- fig = pickle.load(pkl)
 
 
 
 
 
306
  st.plotly_chart(fig, use_container_width=True)
307
 
308
  st.write('''
@@ -365,7 +381,7 @@ st.write(f'''
365
  An an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
366
  Using **Zero-shot Classification**, we can classify responses into one of these four categories.
367
  As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
368
- We'll use a different set of {len(sentiment_results):,} tweets related to climate change.
369
  ''')
370
  st.write('\n')
371
 
@@ -377,12 +393,12 @@ st.dataframe(sentiment_results[['Tweet']])
377
  def load_transfomer_pipelines():
378
  classifier_zero_shot = pipeline(
379
  task='zero-shot-classification',
380
- model='valhalla/distilbart-mnli-12-1',
381
  return_all_scores=True
382
  )
383
  classifier_sentiment = pipeline(
384
  task='sentiment-analysis',
385
- model = 'distilbert-base-uncased-finetuned-sst-2-english',
386
  return_all_scores=True
387
  )
388
  return classifier_zero_shot, classifier_sentiment
 
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
  import plotly.express as px
 
12
 
13
  # factor analysis
14
  from factor_analyzer import FactorAnalyzer
 
19
  # nlp
20
  from bertopic import BERTopic
21
  from transformers import pipeline
22
+ import transformers
23
 
24
  # custom
25
  import survey_analytics_library as LIB
 
40
  data_survey, data_questions = read_survey_data()
41
 
42
  @st.cache
43
+ def read_tweet_data():
44
  tokyo = pd.read_csv(data_path+'tokyo_olympics_tweets.csv')
45
  return tokyo
46
+ tokyo = read_tweet_data()
47
+
48
+ @st.cache(allow_output_mutation=True)
49
+ def load_bertopic_model_unclean():
50
+ topic_model = BERTopic.load(model_path+'bertopic_model_tokyo_olympics_tweets_unclean')
51
+ return topic_model
52
+ topic_model_unclean = load_bertopic_model_unclean()
53
 
54
  @st.cache(allow_output_mutation=True)
55
  def load_bertopic_model():
 
282
  ''')
283
  st.write('\n')
284
 
285
+ # plot topics using unclean data
286
+ fig = LIB.visualize_barchart_titles(
287
+ topic_model=topic_model_unclean,
288
+ subplot_titles=None,
289
+ n_words=5,
290
+ top_n_topics=8,
291
+ height=300
292
+ )
293
  st.plotly_chart(fig, use_container_width=True)
294
 
295
  st.write('''
 
311
  'Vikas Krishan (Indian Boxer)',
312
  ]
313
 
314
+ # plot topics using clean data with stopwords removed
315
+ fig = LIB.visualize_barchart_titles(
316
+ topic_model=topic_model,
317
+ subplot_titles=labelled_topics,
318
+ n_words=5,
319
+ top_n_topics=8,
320
+ height=300
321
+ )
322
  st.plotly_chart(fig, use_container_width=True)
323
 
324
  st.write('''
 
381
  An an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
382
  Using **Zero-shot Classification**, we can classify responses into one of these four categories.
383
  As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
384
+ We'll use a different set of 10,000 tweets related to climate change.
385
  ''')
386
  st.write('\n')
387
 
 
393
  def load_transfomer_pipelines():
394
  classifier_zero_shot = pipeline(
395
  task='zero-shot-classification',
396
+ model=model_path+'distilbart-mnli-12-1',
397
  return_all_scores=True
398
  )
399
  classifier_sentiment = pipeline(
400
  task='sentiment-analysis',
401
+ model=model_path+'distilbert-base-uncased-finetuned-sst-2-english',
402
  return_all_scores=True
403
  )
404
  return classifier_zero_shot, classifier_sentiment
data/tokyo_topics.csv DELETED
@@ -1,131 +0,0 @@
1
- Topic,Count,Name
2
- -1,2624,-1_silver_medal_proud_mirabaichanu
3
- 0,428,0_banda_zambia_barbra_barbra banda
4
- 1,356,1_india_proud_indians_moment
5
- 2,296,2_sutirtha_mukherjee_sutirtha mukherjee_tabletennis
6
- 3,287,3_mirabaichanu hearty_lifting womens_mirabaichanu lifting_hearty congratulations
7
- 4,248,4_race_road_road race_carapaz
8
- 5,210,5_japan_volleyball_venezuela_flag
9
- 6,195,6_kerr_sam_sam kerr_matildas
10
- 7,183,7_vikas_boxing_krishan_vikas krishan
11
- 8,163,8_gymnastics_mens gymnastics_max_whitlock
12
- 9,148,9_tennis_murray_singles_nagal
13
- 10,121,10_bbc_coverage_live_bbcsport
14
- 11,120,11_ina_facebook_action_officialvkyadav
15
- 12,115,12_puneethrajkumar cheer4india_dvitva james_dvitva_james puneethrajkumar
16
- 13,113,13_hockey_south africa_gbhockey_africa
17
- 14,100,14_judo_takato_gold_japans
18
- 15,97,15_chanu_mirabai chanu_chanu wins_mirabai
19
- 16,92,16_swimming_swimming swimming_aquatics_finals
20
- 17,89,17_medal weightlifting_mirabaichanu winning_ace indian_congratulations ace
21
- 18,87,18_q2_canwnt_corner_follow live
22
- 19,85,19_winning medal_indias medal_medal india_medal
23
- 20,84,20_basketball_3x3_3x3 basketball_usa
24
- 21,80,21_butterfly_100m_heat_100m butterfly
25
- 22,78,22_weightlifter_weightlifter mirabai_chanu_mirabai chanu
26
- 23,78,23_kosovo_distria_krasniqi_distria krasniqi
27
- 24,77,24_swevaus_damn_furniture_swevaus fuck
28
- 25,75,25_yulo_carlos_carlos yulo_rings
29
- 26,71,26_ceremony_opening ceremony_opening_drones
30
- 27,69,27_medal ongoing_ongoing_winning indias_indias medal
31
- 28,64,28_teamgb_gb_come_team gb
32
- 29,62,29_sweden_swedes_swevaus_swedish
33
- 30,61,30_sweden_australia_rolfo_fridolina
34
- 31,60,31_japan_britain_great britain_japan great
35
- 32,59,32_rule_remedy_remedy rule_butterfly
36
- 33,55,33_silver medal_winning silver_silver_mirabaichanu winning
37
- 34,52,34_mirabaichanu proud_proud_proud mirabaichanu_mirabaichanu
38
- 35,51,35_chile_canada_beckie_janine
39
- 36,51,36_mediasai_virenrasquinha imrahultrehan_iosindiaoff virenrasquinha_iosindiaoff
40
- 37,49,37_clareburt_lewis_lewis clareburt_kalisz
41
- 38,49,38_dressage_equestrian_horse_equestrian dressage
42
- 39,47,39_mirabaichanu wins_49kg category_category_india snatches
43
- 40,47,40_imrahultrehan congratulations_mirabaichanu mediasai_railminindia_iosindiaoff
44
- 41,47,41_silver medal_tally_silver_medals tally
45
- 42,47,42_penalty_penalty china_ref_referee
46
- 43,45,43_country proud_medal country_country_winning silver
47
- 44,45,44_teammalaysia_teamindia_teamina_congrats teammalaysia
48
- 45,44,45_daddies_badminton_daddies badminton_ina
49
- 46,44,46_chirag_rankireddy_shetty_chirag shetty
50
- 47,44,47_countrys medal_bringing glory_glory medal_countrys
51
- 48,43,48_medals_1001_medals won_1001 1001
52
- 49,43,49_badminton_badmintonmalaysia_ina_wooi yik
53
- 50,42,50_achieving medal_mirabaichanu achieving_achieving_medal india
54
- 51,41,51_badminton_malaysia_double_sokongmalaysia
55
- 52,41,52_sleep_saturday_hours_watch
56
- 53,41,53_cheer4india_teamindia_da boys_teamindia best
57
- 54,40,54_sweaus_sweaus football_swe_aus
58
- 55,40,55_pistol_10m_air pistol_air
59
- 56,39,56_medal weightlifting_winning silver_weightlifting_silver medal
60
- 57,38,57_silver india_silver_india_mirabaichanu silver
61
- 58,37,58_flying start_flying_start huge_huge congratulations
62
- 59,36,59_archery_mixed team_korea_mixed
63
- 60,35,60_covid19_covid_paralympics_test
64
- 61,35,61_athletes_olympians_proud athletes_congratulations joebrier99
65
- 62,35,62_penalty_swevaus_penalty swevaus_swevaus penalty
66
- 63,35,63_pakistan_uae_athletes_afghanistan
67
- 64,34,64_asked_asked happier_india elated_happier start
68
- 65,34,65_smith_brendon_brendon smith_swim
69
- 66,33,66_matildas_sweden_matildas sweden_attacking
70
- 67,32,67_mirabaichanu cheer4india_cheer4india_cheer4india mirabaichanu_mirabaichanu congratulations
71
- 68,32,68_day mirabaichanu_indias 1st_medal day_weightlifting india
72
- 69,31,69_boxing_boxers_welterweights_delante
73
- 70,31,70_loving_let party_officially held_waiting gymnastics
74
- 71,31,71_400m_mens 400m_heat_400
75
- 72,30,72_malaysia_malaysiaboleh_malaysiaboleh congrats_malaysia malaysia
76
- 73,30,73_time india_india clinches_clinches medal_day hearty
77
- 74,30,74_silver medal_medal india_silver_india
78
- 75,30,75_mirabai chanu_mirabai_chanu_saikhom mirabai
79
- 76,30,76_football_womens football_soccer_women
80
- 77,30,77_mcgrail_peter mcgrail_peter_butdee
81
- 78,29,78_display weightlifting_amazing display_absolutely amazing_display
82
- 79,29,79_cheer4india_medal cheer4india_indias mirabaichanu_medal medal
83
- 80,29,80_mirabaichanu teamindia_teamindia_teamindia mirabaichanu_proud teamindia
84
- 81,29,81_spain_waterpolo_water polo_polo
85
- 82,29,82_daddies_daddies daddies_daddies victory_mantap daddies
86
- 83,28,83_pen_pen swevaus_swevaus pen_swevaus
87
- 84,28,84_mirabaichanu mirabaichanu_mirabaichanu_congratulations mirabaichanu_power
88
- 85,27,85_congratulations mirabai_chanu winning_mirabai chanu_49 kg
89
- 86,27,86_silver weightlifting_huge congratulations_huge_winning silver
90
- 87,27,87_qian_yang qian_yang_chinas
91
- 88,27,88_medal womens_category_49kg_winning silver
92
- 89,27,89_potential_massive potential_long term_term quick
93
- 90,26,90_matildas_fark_pen matildas_matildas matildas
94
- 91,26,91_grande_carapaz_hispanos_grande carapaz
95
- 92,25,92_gift selflove_mensfashion_selflove_selfie mensfashion
96
- 93,24,93_matildas_matildas swevaus_swevaus_swevaus matildas
97
- 94,24,94_49_womens 49_49 kgs_kgs
98
- 95,24,95_thematildas_goaustralia_thematildas samkerr1_goaustralia thematildas
99
- 96,23,96_new zealand_zealand_hockey_new
100
- 97,23,97_chanu secured_secured medal_secured_country winning
101
- 98,23,98_weightlifting lets_lets cheer_cheer india_cheer
102
- 99,23,99_raymondcupid kyereminator_kyereminator daterush_watch hisbella4_kyereminator
103
- 100,22,100_mirabaichanu silver_silver_silver mirabaichanu_mam silver
104
- 101,22,101_nigeria_ghana_team_ghanas
105
- 102,22,102_aus_ausvswe_aussies_australia
106
- 103,22,103_winning silver_mirabaichanu winning_weightlifting medal_silver weightlifting
107
- 104,22,104_teamindia 49kg_silver medal_mirabaichanu won_medal radiant
108
- 105,21,105_swimming_bbcsport_bbc_swimming heats
109
- 106,21,106_mirabaichanu weightlifting_weightlifting_india mirabaichanu_india
110
- 107,21,107_mirabaichanu weightlifting_weightlifting_spirits_bow
111
- 108,21,108_history mirabai_teamindia mirabaichanu_chanu won_medal teamindia
112
- 109,21,109_giochiolimpici_forzaazzurri_olimpiadi forzaazzurri_olimpiadi
113
- 110,21,110_handball_portugal_egypt_esp
114
- 111,21,111_seto_daiya_daiya seto_shock
115
- 112,21,112_congratulations mirabai_chanu winning_chanu_mirabai
116
- 113,21,113_brazil_netherlands_netherlands brazil_brazil womens
117
- 114,20,114_mohanlal_mirabaichanu congratulations_mohanlal mirabaichanu_winning indias
118
- 115,20,115_day congratulations_congratulations saikhom_49kg weightlift_weightlift
119
- 116,20,116_saikhom_saikhom mirabai_congratulations saikhom_chanu winning
120
- 117,20,117_dreams_criticism blood_sacrifice_criticism
121
- 118,20,118_peaty_adam_adam peaty_adampeaty
122
- 119,19,119_actor_medal winner_favourite actor_winner mirabaichanu
123
- 120,19,120_peng_ying_chan peng_chan
124
- 121,19,121_taekwondo_jin_barbosa_kurt
125
- 122,18,122_fencing_samele fencing_samele_2nd round
126
- 123,18,123_congratulated winning_mirabaichanu congratulated_congratulated_spoke
127
- 124,18,124_strikes medal_india strikes_strikes_medal 49
128
- 125,17,125_mirabaichanu comes_comes india_mohanlal mirabaichanu_mohanlal
129
- 126,15,126_carrying_moment_proud moment_proud
130
- 127,15,127_cheer4india_country cheer4india_teamindia_medal teamindia
131
- 128,15,128_medal mirabai_kg womens_medal 49_womens weightlifting
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/topics_tokyo.pickle DELETED
Binary file (9.44 kB)
 
data/topics_tokyo_unclean.pickle DELETED
Binary file (9.34 kB)
 
models/bertopic_model_tokyo_olympics_tweets_unclean ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0de856ed231c12e7baeaff15eb3159e1a5ef7c5512b459f915f46712f6d203a3
3
+ size 71961846
models/distilbart-mnli-12-1/README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ datasets:
3
+ - mnli
4
+ tags:
5
+ - distilbart
6
+ - distilbart-mnli
7
+ pipeline_tag: zero-shot-classification
8
+ ---
9
+
10
+ # DistilBart-MNLI
11
+
12
+ distilbart-mnli is the distilled version of bart-large-mnli created using the **No Teacher Distillation** technique proposed for BART summarisation by Huggingface, [here](https://github.com/huggingface/transformers/tree/master/examples/seq2seq#distilbart).
13
+
14
+ We just copy alternating layers from `bart-large-mnli` and finetune more on the same data.
15
+
16
+
17
+ | | matched acc | mismatched acc |
18
+ | ------------------------------------------------------------------------------------ | ----------- | -------------- |
19
+ | [bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli) (baseline, 12-12) | 89.9 | 90.01 |
20
+ | [distilbart-mnli-12-1](https://huggingface.co/valhalla/distilbart-mnli-12-1) | 87.08 | 87.5 |
21
+ | [distilbart-mnli-12-3](https://huggingface.co/valhalla/distilbart-mnli-12-3) | 88.1 | 88.19 |
22
+ | [distilbart-mnli-12-6](https://huggingface.co/valhalla/distilbart-mnli-12-6) | 89.19 | 89.01 |
23
+ | [distilbart-mnli-12-9](https://huggingface.co/valhalla/distilbart-mnli-12-9) | 89.56 | 89.52 |
24
+
25
+
26
+ This is a very simple and effective technique; as we can see, the performance drop is very small.
27
+
28
+ Detailed performance trade-offs will be posted in this [sheet](https://docs.google.com/spreadsheets/d/1dQeUvAKpScLuhDV1afaPJRRAE55s2LpIzDVA5xfqxvk/edit?usp=sharing).
29
+
30
+
31
+ ## Fine-tuning
32
+ If you want to train these models yourself, clone the [distillbart-mnli repo](https://github.com/patil-suraj/distillbart-mnli) and follow the steps below
33
+
34
+ Clone and install transformers from source
35
+ ```bash
36
+ git clone https://github.com/huggingface/transformers.git
37
+ pip install -qqq -U ./transformers
38
+ ```
39
+
40
+ Download MNLI data
41
+ ```bash
42
+ python transformers/utils/download_glue_data.py --data_dir glue_data --tasks MNLI
43
+ ```
44
+
45
+ Create student model
46
+ ```bash
47
+ python create_student.py \
48
+ --teacher_model_name_or_path facebook/bart-large-mnli \
49
+ --student_encoder_layers 12 \
50
+ --student_decoder_layers 6 \
51
+ --save_path student-bart-mnli-12-6 \
52
+ ```
53
+
54
+ Start fine-tuning
55
+ ```bash
56
+ python run_glue.py args.json
57
+ ```
58
+
59
+ You can find the logs of these trained models in this [wandb project](https://wandb.ai/psuraj/distilbart-mnli).
models/distilbart-mnli-12-1/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 3,
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForSequenceClassification"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 1,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "encoder_attention_heads": 16,
22
+ "encoder_ffn_dim": 4096,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 12,
25
+ "eos_token_id": 2,
26
+ "extra_pos_embeddings": 2,
27
+ "finetuning_task": "mnli",
28
+ "force_bos_token_to_be_generated": false,
29
+ "forced_eos_token_id": 2,
30
+ "gradient_checkpointing": false,
31
+ "id2label": {
32
+ "0": "contradiction",
33
+ "1": "neutral",
34
+ "2": "entailment"
35
+ },
36
+ "init_std": 0.02,
37
+ "is_encoder_decoder": true,
38
+ "label2id": {
39
+ "contradiction": 0,
40
+ "entailment": 2,
41
+ "neutral": 1
42
+ },
43
+ "max_position_embeddings": 1024,
44
+ "model_type": "bart",
45
+ "normalize_before": false,
46
+ "normalize_embedding": true,
47
+ "num_hidden_layers": 12,
48
+ "output_past": false,
49
+ "pad_token_id": 1,
50
+ "scale_embedding": false,
51
+ "static_position_embeddings": false,
52
+ "total_flos": 153130534133111808,
53
+ "transformers_version": "4.7.0.dev0",
54
+ "use_cache": true,
55
+ "vocab_size": 50265
56
+ }
models/distilbart-mnli-12-1/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/distilbart-mnli-12-1/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa79ff59084a5036b07a9cffeaa1b1b7c1aa5edeb1885416a734c001a09aa046
3
+ size 890410947
models/distilbart-mnli-12-1/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
models/distilbart-mnli-12-1/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_max_length": 1024}
models/distilbart-mnli-12-1/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/distilbert-base-uncased-finetuned-sst-2-english/README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ datasets:
5
+ - sst-2
6
+ ---
7
+
8
+ # DistilBERT base uncased finetuned SST-2
9
+
10
+ This model is a checkpoint of [DistilBERT-base-uncased](https://huggingface.co/distilbert-base-uncased), fine-tuned on SST-2.
11
+ This model reaches an accuracy of 91.3 on the dev set (for comparison, the bert-base-uncased version of BERT reaches an accuracy of 92.7).
12
+
13
+ For more details about DistilBERT, we encourage users to check out [this model card](https://huggingface.co/distilbert-base-uncased).
14
+
15
+ # Fine-tuning hyper-parameters
16
+
17
+ - learning_rate = 1e-5
18
+ - batch_size = 32
19
+ - warmup = 600
20
+ - max_seq_length = 128
21
+ - num_train_epochs = 3.0
22
+
23
+ # Bias
24
+
25
+ Based on a few experiments, we observed that this model could produce biased predictions that target underrepresented populations.
26
+
27
+ For instance, for sentences like `This film was filmed in COUNTRY`, this binary classification model will give radically different probabilities for the positive label depending on the country (0.89 if the country is France, but 0.08 if the country is Afghanistan) when nothing in the input indicates such a strong semantic shift. In this [colab](https://colab.research.google.com/gist/ageron/fb2f64fb145b4bc7c49efc97e5f114d3/biasmap.ipynb), [Aurélien Géron](https://twitter.com/aureliengeron) made an interesting map plotting these probabilities for each country.
28
+
29
+ <img src="https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/map.jpeg" alt="Map of positive probabilities per country." width="500"/>
30
+
31
+ We strongly advise users to thoroughly probe these aspects on their use-cases in order to evaluate the risks of this model. We recommend looking at the following bias evaluation datasets as a place to start: [WinoBias](https://huggingface.co/datasets/wino_bias), [WinoGender](https://huggingface.co/datasets/super_glue), [Stereoset](https://huggingface.co/datasets/stereoset).
models/distilbert-base-uncased-finetuned-sst-2-english/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForSequenceClassification"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "finetuning_task": "sst-2",
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "NEGATIVE",
13
+ "1": "POSITIVE"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "NEGATIVE": 0,
18
+ "POSITIVE": 1
19
+ },
20
+ "max_position_embeddings": 512,
21
+ "model_type": "distilbert",
22
+ "n_heads": 12,
23
+ "n_layers": 6,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "qa_dropout": 0.1,
27
+ "seq_classif_dropout": 0.2,
28
+ "sinusoidal_pos_embds": false,
29
+ "tie_weights_": true,
30
+ "vocab_size": 30522
31
+ }
models/distilbert-base-uncased-finetuned-sst-2-english/map.jpeg ADDED
models/distilbert-base-uncased-finetuned-sst-2-english/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60554cbd7781b09d87f1ececbea8c064b94e49a7f03fd88e8775bfe6cc3d9f88
3
+ size 267844284
models/distilbert-base-uncased-finetuned-sst-2-english/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_max_length": 512, "do_lower_case": true}
models/distilbert-base-uncased-finetuned-sst-2-english/vocab.txt ADDED
The diff for this file is too large to render. See raw diff